From 6f008bdd9d3c7017f545f37b2bcc97f11660b18c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 11 Apr 2024 09:40:56 -0700 Subject: [PATCH] Automatic detection of dec (. or ,) (#4482) * initial progress on automatic dec=, detection * if sep=, detected, turn off auto-dec * first pass at NEWS and man * add comments, tests * improve man * add verbose output, tests --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/fread.R | 5 +++-- inst/tests/tests.Rraw | 36 +++++++++++++++++++++++-------- man/fread.Rd | 8 +++---- src/fread.c | 50 ++++++++++++++++++++++++++++++++++++++----- src/fread.h | 2 ++ src/freadR.c | 5 +++-- 7 files changed, 86 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4fa8d699b..27c35e385 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,8 @@ 8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix. +9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development. + ## BUG FIXES 1. `unique()` returns a copy in the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). 
This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/fread.R b/R/fread.R index b2e55403d..66bda3fb1 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,5 +1,5 @@ fread = function( -input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto", +input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, @@ -16,7 +16,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") else if (sep=="auto") sep="" # sep=="" at C level means auto sep else stopifnot( nchar(sep)==1L ) # otherwise an actual character to use as sep } - stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L ) + stopifnot( is.character(dec), length(dec)==1L) + if (dec == "auto") dec = "" else stopifnot(nchar(dec) == 1L) # handle encoding, #563 if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) { stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 405ccd0a0..d3a0e37e8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2681,15 +2681,13 @@ if (test_bit64) { test(897, class(DT$b), "integer64") test(898, fread(f), DT) unlink(f) - DT[,a2:=as.integer64(a)][,a3:=as.double(a)][,a4:=gsub(" ","",format(a))] - DT[,b2:=as.double(b)][,b3:=gsub(" ","",format(b))] - DT[,r:=a/100][,r2:=gsub(" ","",format(r))] - DT[112, a2:=as.integer64(12345678901234)] # start on row 112 to avoid the first 100 - 
DT[113, a3:=3.14] - DT[114, a4:="123A"] - DT[115, b2:=1234567890123.45] - DT[116, b3:="12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) - DT[117, r2:="3.14A"] + DT[ , a2 := as.integer64(a)][112L, a2 := as.integer64(12345678901234)] # start on row 112 to avoid the first 100 + DT[ , a3 := as.double(a) ][113L, a3 := 3.14] + DT[ , a4 := as.character(a)][114L, a4 := "123A"] + DT[ , b2 := as.double(b) ][115L, b2 := 1234567890123.45] + DT[ , b3 := as.character(b)][116L, b3 := "12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) + DT[ , r := a/100] + DT[ , r2 := as.character(r)][117L, r2 := "3.14A"] fwrite(DT,f<-tempfile()) test(899.1, fread(f, verbose=TRUE), DT, output="Rereading 6 columns.*out-of-sample.*Column 4.*a2.*int32.*int64.*<<12345678901234>>.*Column 10.*r2.*float64.*string.*<<3.14A>>") test(899.2, fread(f, colClasses=list(character=c("a4","b3","r2"), integer64="a2", double=c("a3","b2")), verbose=TRUE), @@ -18432,3 +18430,23 @@ DF <- structure( ) test(2255, as.data.table(DF), output="DF1.V1.*DF1.V2.*DF2.V3.*DF2.V4.*V5") + +# automatic detection of dec=',' for #2431 +DT = data.table(a = letters, b = 1:26/6, c = 1:26) +## auto-detect dec=',' +fwrite(DT, f <- tempfile(), dec=',', sep=';') +test(2256.1, fread(f), DT) + +fwrite(DT, f, dec=',', sep='|') +test(2256.2, fread(f), DT) + +## auto-detect dec='.' 
+fwrite(DT, f) +test(2256.3, fread(f), DT) + +## verbose output +test(2256.4, fread(f, verbose=TRUE), DT, output="sep=',' so dec set to '.'") + +fwrite(DT, f, dec=',', sep=';') +test(2256.5, fread(f, verbose=TRUE), DT, output="dec=',' detected based on a balance of 18") +test(2256.6, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), output="dec=',' detected based on a balance of 1 ") diff --git a/man/fread.Rd b/man/fread.Rd index 49b187364..d397a441d 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -9,7 +9,7 @@ \code{fread} is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector. } \usage{ -fread(input, file, text, cmd, sep="auto", sep2="auto", dec=".", quote="\"", +fread(input, file, text, cmd, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), # due to change to ""; see NEWS stringsAsFactors=FALSE, verbose=getOption("datatable.verbose", FALSE), @@ -47,7 +47,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" If type coercion results in an error, introduces \code{NA}s, or would result in loss of accuracy, the coercion attempt is aborted for that column with warning and the column's type is left unchanged. If you really desire data loss (e.g. reading \code{3.14} as \code{integer}) you have to truncate such columns afterwards yourself explicitly so that this is clear to future readers of your code. } \item{integer64}{ "integer64" (default) reads columns detected as containing integers larger than 2^31 as type \code{bit64::integer64}. Alternatively, \code{"double"|"numeric"} reads as \code{utils::read.csv} does; i.e., possibly with loss of precision and if so silently. Or, "character". } - \item{dec}{ The decimal separator as in \code{utils::read.csv}. If not "." 
(default) then usually ",". See details. } + \item{dec}{ The decimal separator as in \code{utils::read.csv}. When \code{"auto"} (the default), an attempt is made to decide whether \code{"."} or \code{","} is more suitable for this input. See details. } \item{col.names}{ A vector of optional names for the variables (columns). The default is to use the header column if present or detected, or if not "V" followed by the column number. This is applied after \code{check.names} and before \code{key} and \code{index}. } \item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.} \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } @@ -79,9 +79,9 @@ If an empty line is encountered then reading stops there with warning if any tex \bold{Line endings:} All known line endings are detected automatically: \code{\\n} (*NIX including Mac), \code{\\r\\n} (Windows CRLF), \code{\\r} (old Mac) and \code{\\n\\r} (just in case). There is no need to convert input files first. \code{fread} running on any architecture will read a file from any architecture. Both \code{\\r} and \code{\\n} may be embedded in character strings (including column names) provided the field is quoted. -\bold{Decimal separator and locale:} \code{fread(\dots,dec=",")} should just work. \code{fread} uses C function \code{strtod} to read numeric data; e.g., \code{1.23} or \code{1,23}. \code{strtod} retrieves the decimal separator (\code{.} or \code{,} usually) from the locale of the R session rather than as an argument passed to the \code{strtod} function. 
So for \code{fread(\dots,dec=",")} to work, \code{fread} changes this (and only this) R session's locale temporarily to a locale which provides the desired decimal separator. +\bold{Decimal separator:} \code{dec} is used to parse numeric fields as the separator between integral and fractional parts. When \code{dec='auto'}, each field that is a candidate for being numeric during column type detection (i.e., parsing as lower types has already failed) is first tried with \code{dec='.'} and, if that fails to produce a numeric field, with \code{dec=','}. At the end of the sample lines, if more fields were successfully parsed with \code{dec=','} than with \code{dec='.'}, \code{dec} is set to \code{','}; otherwise (including ties), \code{dec} is set to \code{'.'}. -On Windows, "French_France.1252" is tried which should be available as standard (any locale with comma decimal separator would suffice) and on unix "fr_FR.utf8" (you may need to install this locale on unix). \code{fread()} is very careful to set the locale back again afterwards, even if the function fails with an error. The choice of locale is determined by \code{options()$datatable.fread.dec.locale}. This may be a \emph{vector} of locale names and if so they will be tried in turn until the desired \code{dec} is obtained; thus allowing more than two different decimal separators to be selected. This is a new feature in v1.9.6 and is experimental. In case of problems, turn it off with \code{options(datatable.fread.dec.experiment=FALSE)}. +Automatic detection of \code{sep} occurs \emph{prior} to column type detection -- as such, it is possible that \code{sep} has been inferred to be \code{','}, in which case \code{dec} is set to \code{'.'}. 
\bold{Quotes:} diff --git a/src/fread.c b/src/fread.c index a1521fb37..e2602e596 100644 --- a/src/fread.c +++ b/src/fread.c @@ -33,6 +33,7 @@ static const char *sof, *eof; static char sep; static char whiteChar; // what to consider as whitespace to skip: ' ', '\t' or 0 means both (when sep!=' ' && sep!='\t') static char quote, dec; +static int linesForDecDot; // when dec='auto', track the balance of fields in favor of dec='.' vs dec=',', ties go to '.' static bool eol_one_r; // only true very rarely for \r-only files // Quote rule: @@ -1206,11 +1207,16 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped skip_white(&ch); if (eol(&ch)) return 0; // empty line int field=0; + const bool autoDec = dec == '\0'; while (field>(%d)"), strlim(ch,20), quoteRule); skip_white(&ch); const char *fieldStart = ch; while (tmpType[field]<=CT_STRING) { + if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '\0') { // guess . first + dec = '.'; + } + fun[tmpType[field]](&fctx); if (end_of_field(ch)) break; skip_white(&ch); @@ -1234,9 +1240,19 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped } } ch = fieldStart; + if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '.') { // . didn't parse a double; try , + dec = ','; + continue; + } while (++tmpType[field]=eof) break; // The 9th jump could reach the end in the same situation and that's ok. As long as the end is sampled is what we want. 
bool bumped = false; // did this jump find any different types; to reduce verbose output to relevant lines int jumpLine = 0; // line from this jump point start + linesForDecDot = 0; while(ch0, apply the bumps (if any) at the end of the successfully completed jump sample ASSERT(jump>0, "jump(%d)>0", jump); @@ -1906,7 +1936,17 @@ int freadMain(freadMainArgs _args) { if (args.header==NA_BOOL8) { for (int j=0; j0) for (int j=0; jCT_EMPTY) { args.header=true; diff --git a/src/fread.h b/src/fread.h index 1e2783643..89dea2592 100644 --- a/src/fread.h +++ b/src/fread.h @@ -36,6 +36,8 @@ typedef enum { NUMTYPE // placeholder for the number of types including drop; used for allocation and loop bounds } colType; +#define IS_DEC_TYPE(x) ((x) == CT_FLOAT64 || (x) == CT_FLOAT64_EXT) // types where dec matters + extern int8_t typeSize[NUMTYPE]; extern const char typeName[NUMTYPE][10]; extern const long double pow10lookup[301]; diff --git a/src/freadR.c b/src/freadR.c index 97fbfadac..035c76eda 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -102,9 +102,10 @@ SEXP freadR( error(_("Internal error: freadR sep not a single character. R level catches this.")); // # nocov args.sep = CHAR(STRING_ELT(sepArg,0))[0]; // '\0' when default "auto" was replaced by "" at R level - if (!(isString(decArg) && LENGTH(decArg)==1 && strlen(CHAR(STRING_ELT(decArg,0)))==1)) + if (!isString(decArg) || LENGTH(decArg)!=1 || strlen(CHAR(STRING_ELT(decArg,0)))>1) { error(_("Internal error: freadR dec not a single character. R level catches this.")); // # nocov - args.dec = CHAR(STRING_ELT(decArg,0))[0]; + } + args.dec = CHAR(STRING_ELT(decArg,0))[0]; // '\0' when default "auto" was replaced by "" at R level if (IS_FALSE(quoteArg)) { args.quote = '\0';