Skip to content

Commit

Permalink
Merge pull request #697 from markfairbanks/separate_wider_delim
Browse files Browse the repository at this point in the history
Implement `separate_wider_delim()`
  • Loading branch information
markfairbanks authored Nov 17, 2022
2 parents f9670f5 + 37c8ab7 commit b73325d
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 5 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ export(separate)
export(separate.)
export(separate_rows)
export(separate_rows.)
export(separate_wider_delim)
export(setDTthreads)
export(slice)
export(slice.)
Expand Down
5 changes: 3 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# tidytable 0.9.2 (in development)

#### New functions
* `pick()`
* `map_vec()`
* `group_cols()`
* `map_vec()`
* `pick()`
* `separate_wider_delim()`

#### Functionality improvements
* `separate()`: Can now handle when too many or too few new names are
Expand Down
6 changes: 5 additions & 1 deletion R/separate.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
#' Separate a character column into multiple columns
#'
#' @description
#' _Superseded_
#'
#' `separate()` has been superseded by `separate_wider_delim()`.
#'
#' Separates a single column into multiple columns using a user supplied separator or regex.
#'
#' If a separator is not supplied one will be automatically detected.
#'
#' Note: Using automatic detection or regex will be slower than simple separators such as "," or ".".
#'
#' @param .df A data.frame or data.table
#' @param .df A data frame
#' @param col The column to split into multiple columns
#' @param into New column names to split into. A character vector.
#' Use `NA` to omit the variable in the output.
Expand Down
103 changes: 103 additions & 0 deletions R/separate_wider_delim.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#' Separate a character column into multiple columns
#'
#' @description
#' Splits each selected column on a fixed (non-regex) delimiter and stores the
#' resulting pieces in new columns.
#'
#' @param .df A data frame
#' @param cols Columns to separate
#' @param delim Delimiter to separate on. Matched as a fixed string.
#' @inheritParams rlang::args_dots_empty
#' @param names New column names to separate into. If `NULL`, the pieces are
#'   numbered `1, 2, ...` and prefixed with the name of the source column.
#'   `NA` entries omit the corresponding piece from the output.
#' @param names_sep Names separator. When supplied, each new name is built as
#'   the source column name, `names_sep`, then the entry from `names`.
#'   Must be provided when more than one column is selected.
#' @param names_repair Treatment of duplicate names. See `?vctrs::vec_as_names` for options/details.
#' @param too_few What to do when too few column names are supplied
#'   (fewer names than pieces). `"align_start"` keeps the leading pieces and
#'   discards the rest; `"error"` aborts.
#' @param too_many What to do when too many column names are supplied
#'   (more names than pieces). `"drop"` fills the surplus columns with `NA`;
#'   `"error"` aborts.
#' @param cols_remove Should old columns be removed
#'
#' @return `.df` with the new columns added and, when `cols_remove = TRUE`,
#'   the separated columns removed, after applying `names_repair`.
#'
#' @export
#'
#' @examples
#' df <- tidytable(x = c("a", "a_b", "a_b", NA))
#'
#' df %>%
#'   separate_wider_delim(x, delim = "_", names = c("left", "right"))
#'
#' df %>%
#'   separate_wider_delim(x, delim = "_", names_sep = "")
separate_wider_delim <- function(.df,
                                 cols,
                                 delim,
                                 ...,
                                 names = NULL,
                                 names_sep = NULL,
                                 names_repair = "check_unique",
                                 too_few = c("align_start", "error"),
                                 too_many = c("drop", "error"),
                                 cols_remove = TRUE) {
  check_required(cols)
  check_required(delim)

  cols <- tidyselect_names(.df, {{ cols }})

  if (length(cols) > 1 && is.null(names_sep)) {
    abort("`names_sep` must be provided when multiple columns are provided")
  }

  # Accumulate into `out` so every selected column contributes to the result,
  # and so selecting zero columns still returns the (name-repaired) input.
  out <- .df

  for (col in cols) {
    # Source values are always read from the original input
    t_str_split <- tstrsplit(.df[[col]], split = delim, fixed = TRUE)

    split_length <- length(t_str_split)

    # Derive per-column names/separator without touching the `names` and
    # `names_sep` arguments: overwriting them here would leak the previous
    # column's prefix into the next iteration.
    if (is.null(names)) {
      col_names <- as.character(seq_len(split_length))
      col_sep <- if (is.null(names_sep)) "" else names_sep
    } else {
      col_names <- names
      col_sep <- names_sep
    }

    names_length <- length(col_names)

    # Record which names are non-missing *before* prefixing, because paste()
    # turns NA into the string "NA".
    keep <- !is.na(col_names)

    # paste() would return a length-1 result for zero-length names, so skip it
    if (!is.null(col_sep) && names_length > 0) {
      col_names <- paste(col, col_names, sep = col_sep)
    }

    # NOTE(review): `too_few`/`too_many` are applied here relative to the
    # number of *names*, as documented above. tidyr defines them relative to
    # the number of pieces -- confirm this mapping is intended.
    if (names_length < split_length) {
      too_few <- arg_match0(too_few, c("align_start", "error"))
      if (too_few == "error") {
        abort("Not enough column names supplied")
      }
      t_str_split <- t_str_split[seq_len(names_length)]
      extra <- character()
    } else if (names_length > split_length) {
      too_many <- arg_match0(too_many, c("drop", "error"))
      if (too_many == "error") {
        abort("Too many column names supplied")
      }
      extra_locs <- seq.int(split_length + 1L, names_length)
      extra <- col_names[extra_locs][keep[extra_locs]]
      # seq_len() rather than 1:n so zero pieces yields zero names
      keep <- keep[seq_len(split_length)]
      col_names <- col_names[seq_len(split_length)]
    } else {
      extra <- character()
    }

    # Drop the pieces whose name was given as NA
    col_names <- col_names[keep]
    t_str_split <- t_str_split[keep]

    if (length(col_names) > 0) {
      out <- dt_j(out, (col_names) := ..t_str_split)
    }

    if (length(extra) > 0) {
      # Names without a matching piece become all-NA character columns
      out <- dt_j(out, (extra) := NA_character_)
    }

    # Keep the source column if any new column reused its name
    if (cols_remove && col %notin% c(col_names, extra)) {
      out <- dt_j(out, (col) := NULL)
    }
  }

  df_name_repair(out, names_repair)
}
6 changes: 5 additions & 1 deletion man/separate..Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/separate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

52 changes: 52 additions & 0 deletions man/separate_wider_delim.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 34 additions & 0 deletions tests/testthat/test-separate_wider_delim.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# These tests are borrowed from
# https://github.com/tidyverse/tidyr/blob/main/tests/testthat/test-separate-wider.R

test_that("separate_wider_delim() can create column names", {
  # No `names` supplied: new columns are the source name plus a piece index
  input <- tidytable(x = c("a b", "x y"))

  result <- separate_wider_delim(input, x, " ", names_sep = "")

  expect_equal(result[["x1"]], c("a", "x"))
  expect_equal(result[["x2"]], c("b", "y"))
})

test_that("separate_wider_delim() can ignore problems", {
  # Rows split into 1, 2 and 3 pieces while only two names are requested
  input <- tidytable(x = c("x", "x y", "x y z"))

  result <- separate_wider_delim(
    input,
    x,
    " ",
    names = c("a", "b"),
    too_few = "align_start",
    too_many = "drop"
  )

  # Short row is padded with NA
  first_row <- result[1, ]
  expect_equal(first_row, tidytable(a = "x", b = NA_character_))

  # Long row loses its trailing piece
  third_row <- result[3, ]
  expect_equal(third_row, tidytable(a = "x", b = "y"))
})

test_that("separate_wider_delim() doesn't count NA input as problem", {
  # A lone NA value should yield NA in every requested output column
  input <- tidytable(x = NA)

  result <- separate_wider_delim(input, x, ",", names = c("a", "b"))
  expected <- tidytable(a = NA_character_, b = NA_character_)

  expect_equal(result, expected)
})

test_that("separate_wider_delim() validates its inputs", {
  input <- tidytable(x = "x")

  # `cols` missing
  expect_error(separate_wider_delim(input))

  # `delim` missing
  expect_error(separate_wider_delim(input, x))
})

0 comments on commit b73325d

Please sign in to comment.