Merge pull request #697 from markfairbanks/separate_wider_delim

Implement `separate_wider_delim()`
markfairbanks · Nov 17, 2022 · b73325d · b73325d
2 parents f9670f5 + 37c8ab7
commit b73325d
Show file tree

Hide file tree

Showing 8 changed files with 208 additions and 5 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -376,6 +376,7 @@ export(separate)
 export(separate.)
 export(separate_rows)
 export(separate_rows.)
+export(separate_wider_delim)
 export(setDTthreads)
 export(slice)
 export(slice.)

diff --git a/NEWS.md b/NEWS.md
@@ -1,9 +1,10 @@
 # tidytable 0.9.2 (in development)
 
 #### New functions
-* `pick()`
-* `map_vec()`
 * `group_cols()`
+* `map_vec()`
+* `pick()`
+* `separate_wider_delim()`
 
 #### Functionality improvements
 * `separate()`: Can now handle when too many or too few new names are

diff --git a/R/separate.R b/R/separate.R
@@ -1,13 +1,17 @@
 #' Separate a character column into multiple columns
 #'
 #' @description
+#' _Superseded_
+#'
+#' `separate()` has been superseded by `separate_wider_delim()`.
+#'
 #' Separates a single column into multiple columns using a user supplied separator or regex.
 #'
 #' If a separator is not supplied one will be automatically detected.
 #'
 #' Note: Using automatic detection or regex will be slower than simple separators such as "," or ".".
 #'
-#' @param .df A data.frame or data.table
+#' @param .df A data frame
 #' @param col The column to split into multiple columns
 #' @param into New column names to split into. A character vector.
 #' Use `NA` to omit the variable in the output.

diff --git a/R/separate_wider_delim.R b/R/separate_wider_delim.R
@@ -0,0 +1,103 @@
+#' Separate character columns into multiple columns
+#'
+#' @description
+#' Splits each selected column on a fixed delimiter and stores the pieces in
+#' new columns.
+#'
+#' @param .df A data frame
+#' @param cols Columns to separate
+#' @param delim Delimiter to separate on. Matched literally, not as a regex.
+#' @inheritParams rlang::args_dots_empty
+#' @param names New column names to separate into. If `NULL` the pieces are
+#'   numbered and prefixed with the name of the column they came from.
+#'   Use `NA` to omit a piece from the output.
+#' @param names_sep Names separator. Placed between the original column name
+#'   and each new name. Required when more than one column is selected.
+#' @param names_repair Treatment of duplicate names. See `?vctrs::vec_as_names` for options/details.
+#' @param too_few What to do when fewer names are supplied than the largest
+#'   number of pieces found. `"align_start"` keeps the leading pieces and
+#'   discards the rest, `"error"` aborts.
+#' @param too_many What to do when more names are supplied than the largest
+#'   number of pieces found. `"drop"` fills the surplus columns with `NA`,
+#'   `"error"` aborts.
+#' @param cols_remove Should old columns be removed
+#'
+#' @export
+#'
+#' @examples
+#' df <- tidytable(x = c("a", "a_b", "a_b", NA))
+#'
+#' df %>%
+#'   separate_wider_delim(x, delim = "_", names = c("left", "right"))
+#'
+#' df %>%
+#'   separate_wider_delim(x, delim = "_", names_sep = "")
+#'
+#' # Several columns at once: `names_sep` is required
+#' df2 <- tidytable(x = c("a_b", "c_d"), y = c("e_f", "g_h"))
+#'
+#' df2 %>%
+#'   separate_wider_delim(c(x, y), delim = "_", names_sep = "_")
+separate_wider_delim <- function(.df,
+                                 cols,
+                                 delim,
+                                 ...,
+                                 names = NULL,
+                                 names_sep = NULL,
+                                 names_repair = "check_unique",
+                                 too_few = c("align_start", "error"),
+                                 too_many = c("drop", "error"),
+                                 cols_remove = TRUE) {
+  check_required(cols)
+  check_required(delim)
+
+  cols <- tidyselect_names(.df, {{ cols }})
+
+  if (length(cols) > 1 && is.null(names_sep)) {
+    abort("`names_sep` must be provided when multiple columns are provided")
+  }
+
+  names_null <- is.null(names)
+
+  # Auto-generated names are always prefixed with the source column name
+  if (names_null && is.null(names_sep)) {
+    names_sep <- ""
+  }
+
+  # Start from the input so every column's result is added to the same output
+  # (and so an empty selection still returns a data frame)
+  out <- .df
+
+  for (col in cols) {
+    t_str_split <- tstrsplit(.df[[col]], split = delim, fixed = TRUE)
+
+    split_length <- length(t_str_split)
+
+    # Work on a per-column copy: `names` must stay untouched for the next column
+    if (names_null) {
+      new_names <- as.character(seq_len(split_length))
+    } else {
+      new_names <- names
+    }
+
+    names_length <- length(new_names)
+
+    # Locate the `NA` placeholders before prefixing, since `paste()` would
+    # turn them into the literal string "NA"
+    is_complete <- vec_detect_complete(new_names)
+
+    if (!is.null(names_sep)) {
+      new_names <- paste(col, new_names, sep = names_sep)
+    }
+
+    if (names_length < split_length) {
+      too_few <- arg_match0(too_few, c("align_start", "error"))
+      if (too_few == "error") {
+        abort("Not enough column names supplied")
+      }
+      # Keep only as many pieces as there are names
+      t_str_split <- t_str_split[seq_len(names_length)]
+      extra <- character()
+    } else if (names_length > split_length) {
+      too_many <- arg_match0(too_many, c("drop", "error"))
+      if (too_many == "error") {
+        abort("Too many column names supplied")
+      }
+      # Names without a matching piece become all-`NA` columns below
+      is_extra <- seq_len(names_length) > split_length
+      extra <- new_names[is_extra & is_complete]
+      new_names <- new_names[!is_extra]
+      is_complete <- is_complete[!is_extra]
+    } else {
+      extra <- character()
+    }
+
+    # Drop the pieces the caller asked to omit with `NA`
+    new_names <- new_names[is_complete]
+
+    t_str_split <- t_str_split[is_complete]
+
+    if (length(new_names) > 0) {
+      out <- dt_j(out, (new_names) := ..t_str_split)
+    }
+
+    if (length(extra) > 0) {
+      out <- dt_j(out, (extra) := NA_character_)
+    }
+
+    # Keep the source column when one of the new columns reuses its name
+    if (cols_remove && col %notin% new_names) {
+      out <- dt_j(out, (col) := NULL)
+    }
+  }
+
+  df_name_repair(out, names_repair)
+}
diff --git a/man/separate..Rd b/man/separate..Rd
diff --git a/man/separate.Rd b/man/separate.Rd
diff --git a/man/separate_wider_delim.Rd b/man/separate_wider_delim.Rd
diff --git a/tests/testthat/test-separate_wider_delim.R b/tests/testthat/test-separate_wider_delim.R
@@ -0,0 +1,34 @@
+# These tests are borrowed from
+# https://github.com/tidyverse/tidyr/blob/main/tests/testthat/test-separate-wider.R
+
+test_that("separate_wider_delim() can create column names", {
+  # No `names` supplied: pieces are numbered and prefixed with the column name
+  input <- tidytable(x = c("a b", "x y"))
+  result <- separate_wider_delim(input, x, " ", names_sep = "")
+  expect_equal(result[["x1"]], c("a", "x"))
+  expect_equal(result[["x2"]], c("b", "y"))
+})
+
+test_that("separate_wider_delim() can ignore problems", {
+  # Rows have one, two and three pieces, but only two names are supplied
+  df <- tidytable(x = c("x", "x y", "x y z"))
+  out <- df %>% separate_wider_delim(x, " ",
+                                     names = c("a", "b"),
+                                     too_few = "align_start",
+                                     too_many = "drop"
+  )
+  # Short row is padded with NA, exact row is unchanged, long row is truncated
+  expect_equal(out[1, ], tidytable(a = "x", b = NA_character_))
+  expect_equal(out[2, ], tidytable(a = "x", b = "y"))
+  expect_equal(out[3, ], tidytable(a = "x", b = "y"))
+})
+
+test_that("separate_wider_delim() doesn't count NA input as problem", {
+  # A lone NA yields a single piece; both requested columns come back as NA
+  input <- tidytable(x = NA)
+  actual <- separate_wider_delim(input, x, ",", names = c("a", "b"))
+  expected <- tidytable(a = NA_character_, b = NA_character_)
+  expect_equal(actual, expected)
+})
+
+test_that("separate_wider_delim() validates its inputs", {
+  input <- tidytable(x = "x")
+  # `cols` missing
+  expect_error(separate_wider_delim(input))
+  # `delim` missing
+  expect_error(separate_wider_delim(input, x))
+})