NOTE(review): this patch was recovered from a whitespace-collapsed copy. Line breaks
follow the hunk line counts, but leading indentation and blank context lines are
reconstructed -- verify against the target files before applying.

diff --git a/NEWS.md b/NEWS.md
index 9163d9eb3f..e5dc3e4c0d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -42,6 +42,8 @@
 
 13. `dcast`gains `value.var.in.dots`, `value.var.in.LHSdots` and `value.var.in.RHSdots` arguments, [#5824](https://github.com/Rdatatable/data.table/issues/5824). This allows the `value.var` variable(s) in `dcast` to be represented by `...` in the formula (if not otherwise mentioned). Thanks to @iago-pssjd for the report and PR.
 
+14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR.
+
 ## BUG FIXES
 
 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix.
diff --git a/R/fread.R b/R/fread.R
index fc22e9c544..1ce637eaf8 100644
--- a/R/fread.R
+++ b/R/fread.R
@@ -116,10 +116,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     gz_signature = as.raw(c(0x1F, 0x8B))
     bz2_signature = as.raw(c(0x42, 0x5A, 0x68))
     gzsig = FALSE
-    if ((w <- endsWithAny(file, c(".gz",".bz2"))) || (gzsig <- identical(head(file_signature, 2L), gz_signature)) || identical(head(file_signature, 3L), bz2_signature)) {
+    if ((w <- endsWithAny(file, c(".gz", ".bgz",".bz2"))) || (gzsig <- identical(head(file_signature, 2L), gz_signature)) || identical(head(file_signature, 3L), bz2_signature)) {
       if (!requireNamespace("R.utils", quietly = TRUE))
         stopf("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov
-      FUN = if (w==1L || gzsig) gzfile else bzfile
+      FUN = if (w %in% 1:2 || gzsig) gzfile else bzfile # w is falsy when no suffix matched (bz2 found by signature only), so w<=2L would wrongly pick gzfile
       R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional
       file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download
       on.exit(unlink(decompFile), add=TRUE)
diff --git a/inst/tests/ch11b.dat.bgz b/inst/tests/ch11b.dat.bgz
new file mode 100644
index 0000000000..5ef72ad015
Binary files /dev/null and b/inst/tests/ch11b.dat.bgz differ
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 90ba1f73d8..0df904956a 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -17903,6 +17903,9 @@ if (test_R.utils) {
   test(2229.4, fread(f, logical01=FALSE)[,1], data.table(V1 = 1:100))
   file.copy(testDir("issue_785_fread.txt.gz"), f, overwrite=TRUE)
   test(2229.5, fread(f, logical01=FALSE)[,25], data.table(Sv3 = c(10,14,14,15)))
+  # support .bgz with fread #5461
+  file.copy(testDir("ch11b.dat.bgz"), f, overwrite=TRUE)
+  test(2229.55, fread(f, logical01=FALSE)[,1], data.table(V1 = 1:100))
 }
 unlink(f)
 # not supporting multi file zips yet