Skip to content

Commit

Permalink
utf-8 without BOM now by default
Browse files Browse the repository at this point in the history
  • Loading branch information
ddooley committed Feb 4, 2025
1 parent ea47cdc commit 5ba4694
Showing 1 changed file with 42 additions and 26 deletions.
68 changes: 42 additions & 26 deletions lib/utils/files.js
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ function writeWorkbook(workbook, baseName, ext, opt = {}) {
* Download matrix to file.
* Note that BOM and UTF-8 can create problems on some systems when importing
* file. See "Supported Output Formats" and "UTF-16 Unicode Text" sections of
* https://reactian.com/sheetjs-community-edition-spreadsheet-data-toolkit/
* and https://github.com/SheetJS/sheetjs
* Solution at bottom of: https://github.com/SheetJS/sheetjs/issues/943
* The "Comma Separated Values" format is actually UTF-8 with BOM prefix.
Expand All @@ -167,7 +166,6 @@ export function exportWorkbook(workbook, baseName, ext) {
const worksheet = workbook.Sheets[sheetName];
const fileName = `${baseName}${sheets.length > 1 ? `_${sheetName}` : ''}.${ext.split(' ')[0]}`;
var data = '';

switch (ext) {
case 'xlsx':
case 'xls':
Expand All @@ -179,52 +177,74 @@ export function exportWorkbook(workbook, baseName, ext) {
See
- https://docs.sheetjs.com/docs/api/write-options/
- https://docs.sheetjs.com/docs/api/utilities/csv#csv-output
* We want more accurate mimeTypes, so saveBlob() allows this.
saveBlob() enables more accurate mimeTypes?
* writeFile(bookType: 'csv'...) output includes the UTF-8 byte order
* mark ("BOM").
* writeFile(bookType: 'tsv'...) output will NOT include the BOM ???
* sheet_to_csv() will return JavaScript strings without the UTF-8 BOM.
* sheet_to_txt(): If encoding support is available, the output will be
* encoded in CP1200 and the UTF-16 BOM will be added. If encoding
* support is not available, the output will be encoded as a standard
* string.
* So is encoding support available?
*/
case 'csv':

/* Phasing this out. UTF-8 doesn't need a BOM
case 'csv': // UTF-8
// writeFile(workbook, fileName, {bookType: 'csv', FS: ','});
data = XlsxUtils.sheet_to_csv(worksheet, {FS: ','});
data = '\uFEFF' + data; //BOM
saveBlob(data, fileName, 'text/plain;charset=UTF-8');
break;
*/

/* This case won't work until we convert data to UTF-16
case 'csv (UTF-16)':
//writeFile(workbook, fileName, {bookType: 'txt', FS: ','});
data = XlsxUtils.sheet_to_csv(worksheet, {FS: ','});
data = '\uFEFF' + data; //BOM
saveBlob(data, fileName, 'text/plain;charset=UTF-16LE');
break;
*/

case 'csv':
case 'csv (UTF-8, no BOM)':
data = XlsxUtils.sheet_to_csv(worksheet, {FS: ','});
saveBlob(data, fileName, 'text/plain;charset=UTF-8');
break;

/* This case won't work until we convert data to ASCII
case 'csv (ASCII)': // no BOM
data = XlsxUtils.sheet_to_csv(worksheet, {FS: ','});
saveBlob(data, fileName, 'text/plain;charset=us-ascii');
break;
*/

case 'tsv': // BOM version
/*
* https://stackoverflow.com/questions/8336355/what-exactly-is-unicode-codepage-1200
* sheet_to_txt(): sheetjs notes: "If encoding support is available, the
* output will be encoded in CP1200 and the UTF-16 BOM will be added. If
* encoding support is not available, the output will be encoded as a
* standard string." In DH tests it seems "encoding support" is not
* available, and resulting file is UTF-8 +BOM anyways.
*/
case 'tsv': // UTF-8 BOM version
// SheetJS note: For compatibility with Excel, csv output will always
// include the UTF-8 byte order mark ("BOM").
//writeFile(workbook, fileName, {bookType: 'csv', FS: '\t'});
data = XlsxUtils.sheet_to_csv(worksheet, {FS: '\t'});
data = '\uFEFF' + data; //BOM
saveBlob(data, fileName, 'text/plain;charset=UTF-8');
break;

case 'tsv (UTF-16)':
/* Not working, produces hexadecimal file - is charset="UTF-16LE" recognized?
* See Table 2-4: unicode.org/versions/Unicode6.0.0/ch02.pdf
* UTF-16 little endian, aka code page 1200, is not permitted to have a BOM,
* according to the Unicode standard.
* DATA NEEDS TO BE CONVERTED TO UTF-16
*
case 'tsv (UTF-16)': // no BOM
// See: https://localizely.com/character-encodings/utf16le/
//writeFile(workbook, fileName, {bookType: 'tsv', FS: '\t'});
data = XlsxUtils.sheet_to_txt(worksheet, {FS: '\t'});
saveBlob(data, fileName, 'text/plain;charset=UTF-16LE');
break;

case 'csv (UTF-8, no BOM)':
data = XlsxUtils.sheet_to_csv(worksheet, {FS: ','});
saveBlob(data, fileName, 'text/plain;charset=UTF-8');
break;

case 'csv (ASCII)': // no BOM
data = XlsxUtils.sheet_to_csv(worksheet, {FS: ','});
saveBlob(data, fileName, 'text/plain;charset=us-ascii');
break;
*/
}
})
};
Expand Down Expand Up @@ -253,17 +273,13 @@ function saveBlob(
}
*/

// https://docs.sheetjs.com/docs/api/utilities/csv#csv-output
// "If encoding support is available, the output will be encoded in CP1200 and the UTF-16 BOM will be added. If encoding support is not available, the output will be encoded as a standard string."

// Enhancing with mimeType
const blob = new Blob([data], { type: mimeType });
saveAs(blob, fileName);
};

// TODO: refactor to export matrix
export function exportFile(matrix, baseName, ext) {
console.log("running exportFile", matrix, baseName, ext)
const worksheet = XlsxUtils.aoa_to_sheet(matrix);
const workbook = XlsxUtils.book_new();
XlsxUtils.book_append_sheet(workbook, worksheet, DEFAULT_SHEETNAME);
Expand Down

0 comments on commit 5ba4694

Please sign in to comment.