From 59137b0a2af43058658accbfd5d6c5c6ba0b21c6 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 14 Feb 2019 00:04:20 +0800 Subject: [PATCH] Modernize ?setkey; delete long-deprecated functions; part of #2572 --- NAMESPACE | 4 +-- R/setkey.R | 20 ----------- man/setkey.Rd | 98 ++++++++++++++++++--------------------------------- 3 files changed, 36 insertions(+), 86 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index acc44ad51..b27b52a14 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,8 +5,8 @@ import(methods) exportClasses(data.table, IDate, ITime) ## -export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy) -export(set2key, set2keyv, key2, setindex, setindexv, indices) +export(data.table, tables, setkey, setkeyv, key, haskey, CJ, SJ, copy) +export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table,last,first,like,"%like%",between,"%between%",inrange,"%inrange%") export(timetaken) export(truelength, alloc.col, ":=") diff --git a/R/setkey.R b/R/setkey.R index a7a03cdac..bcd7a39d7 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -18,16 +18,6 @@ setindexv <- function(x, cols, verbose=getOption("datatable.verbose")) { } } -set2key <- function(...) { - stop("set2key() is now deprecated. Please use setindex() instead.") -} -set2keyv <- function(...) { - stop("set2keyv() is now deprecated. Please use setindexv() instead.") -} -key2 <- function(x) { - stop("key2() is now deprecated. Please use indices() instead.") -} - setkeyv <- function(x, cols, verbose=getOption("datatable.verbose"), physical=TRUE) { if (is.null(cols)) { # this is done on a data.frame when !cedta at top of [.data.table @@ -127,16 +117,6 @@ getindex <- function(x, name) { ans } -"key<-" <- function(x,value) { - warning("The key(x)<-value form of setkey can copy the whole table. This is due to <- in R itself. Please change to setkeyv(x,value) or setkey(x,...) which do not copy and are faster. See help('setkey'). 
You can safely ignore this warning if it is inconvenient to change right now. Setting options(warn=2) turns this warning into an error, so you can then use traceback() to find and change your key<- calls.") - setkeyv(x,value) - # The returned value here from key<- is then copied by R before assigning to x, it seems. That's - # why we can't do anything about it without a change in R itself. If we return NULL (or invisible()) from this key<- - # method, the table gets set to NULL. So, although we call setkeyv(x,cols) here, and that doesn't copy, the - # returned value (x) then gets copied by R. - # So, solution is that caller has to call setkey or setkeyv directly themselves, to avoid <- dispatch and its copy. -} - haskey <- function(x) !is.null(key(x)) # reverse a vector by reference (no copy) diff --git a/man/setkey.Rd b/man/setkey.Rd index 62df12ba1..6265e2463 100644 --- a/man/setkey.Rd +++ b/man/setkey.Rd @@ -2,37 +2,29 @@ \alias{setkey} \alias{setkeyv} \alias{key} -\alias{key<-} \alias{haskey} -\alias{set2key} -\alias{set2keyv} \alias{setindex} \alias{setindexv} -\alias{key2} \alias{indices} -\title{ Create key on a data table } +\title{ Create key on a data.table } \description{ In \code{data.table} parlance, all \code{set*} functions change their input \emph{by reference}. That is, no copy is made at all, other than temporary working memory, which is as large as one column. The only other \code{data.table} operator that modifies input by reference is \code{\link{:=}}. Check out the -\code{See Also} section below for other \code{set*} function \code{data.table} +\code{See Also} section below for other \code{set*} functions \code{data.table} provides. -\code{setkey()} sorts a \code{data.table} and marks it as sorted (with an -attribute \code{sorted}). The sorted columns are the key. The key can be any -columns in any order. The columns are sorted in ascending order always. The table -is changed \emph{by reference} and is therefore very memory efficient. 
+\code{setkey} sorts a \code{data.table} and marks it as sorted (with an +attribute \code{sorted}). The sorted columns are the key. The key can be any number of columns in any order. The columns are always sorted in \emph{ascending} order. The table is changed \emph{by reference} and \code{setkey} is therefore very memory efficient. -\code{setindex()} creates an index (or indices) on provided columns. This index is simply an -order of the dataset's according to the provided columns. This order is stored as a \code{data.table} -attribute, and the dataset retains the original order in memory. +\code{setindex} creates an index (or indices) on provided columns. This index is simply an ordering vector of the dataset's rows according to the provided columns. For example, if the ordering of \code{x} for columns \code{i} is \code{index_i}, then the result of \code{x[index_i]} will be sorted by \code{i}. This order is stored as an attribute of the \code{data.table}, and the dataset retains the original order in memory. See \code{vignette("datatable-secondary-indices-and-auto-indexing")} for more details. -\code{key()} returns the \code{data.table}'s key if it exists, and \code{NULL} -if none exist. +\code{key} returns the \code{data.table}'s key if it exists, and \code{NULL} +if none exists. -\code{haskey()} returns a logical \code{TRUE}/\code{FALSE} depending on whether +\code{haskey} returns a logical \code{TRUE}/\code{FALSE} depending on whether the \code{data.table} has a key (or not). } \usage{ @@ -43,80 +35,59 @@ setindexv(x, cols, verbose=getOption("datatable.verbose")) key(x) indices(x, vectors = FALSE) haskey(x) -key(x) <- value # DEPRECATED, please use setkey or setkeyv instead. } \arguments{ \item{x}{ A \code{data.table}. } -\item{\dots}{ The columns to sort by. Do not quote the column names. If -\code{\dots} is missing (i.e. \code{setkey(DT)}), all the columns are used. -\code{NULL} removes the key. 
} -\item{cols}{ A character vector of column names. For \code{setindexv}, this can be a \code{list} of character vectors, in which case each element will be applied as an index. } -\item{value}{ In (deprecated) \code{key<-}, a character vector (only) of column -names.} +\item{\dots}{ The columns to sort by. Do not quote the column names. If \code{\dots} is missing (i.e. \code{setkey(DT)}), all the columns are used. \code{NULL} removes the key. } +\item{cols}{ A character vector of column names. For \code{setindexv}, this can be a \code{list} of character vectors, in which case each element will be applied as an index in turn. } \item{verbose}{ Output status and information. } -\item{physical}{ TRUE changes the order of the data in RAM. FALSE adds a -secondary key a.k.a. index. } -\item{vectors}{ logical scalar default \code{FALSE}, when set to \code{TRUE} -then list of character vectors is returned, each vector refers to one index. } +\item{physical}{ \code{TRUE} changes the order of the data in RAM. \code{FALSE} adds an index. } +\item{vectors}{ \code{logical} scalar, default \code{FALSE}; when set to \code{TRUE}, a \code{list} of character vectors is returned, each referring to one index. } } \details{ -\code{setkey} reorders (or sorts) the rows of a data.table by the columns -provided. In versions \code{1.9+}, for \code{integer} columns, a modified version -of base's counting sort is implemented, which allows negative values as well. It -is extremely fast, but is limited by the range of integer values being <= 1e5. If -that fails, it falls back to a (fast) 4-pass radix sort for integers, implemented -based on Pierre Terdiman's and Michael Herf's code (see links below). Similarly, +\code{setkey} reorders (or sorts) the rows of a \code{data.table} by the columns +provided. For \code{integer} columns, a modified version +of \code{base}'s counting sort is implemented, which allows negative values as well. 
It is extremely fast, but is limited by the range of integer values being <= 1e5. If that fails, it falls back to a (fast) 4-pass radix sort for integers, implemented based on Pierre Terdiman's and Michael Herf's code (see links below). Similarly, a very fast 6-pass radix order for columns of type \code{double} is also implemented. -This gives a speed-up of about 5-8x compared to \code{1.8.10} on \code{setkey} -and all internal \code{order}/\code{sort} operations. Fast radix sorting is also -implemented for \code{character} and \code{bit64::integer64} types. -The sort is \emph{stable}; i.e., the order of ties (if any) is preserved, in both -versions - \code{<=1.8.10} and \code{>= 1.9.0}. +Fast radix sorting is also implemented for \code{character} and \code{bit64::integer64} types. -In \code{data.table} versions \code{<= 1.8.10}, for columns of type \code{integer}, -the sort is attempted with the very fast \code{"radix"} method in -\code{\link[base:order]{sort.list}}. If that fails, the sort reverts to the default -method in \code{\link[base]{order}}. For character vectors, \code{data.table} -takes advantage of R's internal global string cache and implements a very efficient -order, also exported as \code{\link{chorder}}. +The sort is \emph{stable}; i.e., the order of ties (if any) is preserved. -In v1.7.8, the \code{key<-} syntax was deprecated. The \code{<-} method copies -the whole table and we know of no way to avoid that copy without a change in -\R itself. Please use the \code{set}* functions instead, which make no copy at -all. \code{setkey} accepts unquoted column names for convenience, whilst -\code{setkeyv} accepts one vector of column names. +For character vectors, \code{data.table} takes advantage of R's internal global string cache and implements a very efficient order, also exported as \code{\link{chorder}}. 
+ +Please also use the other \code{set*} functions which modify +objects by reference, rather than using the \code{<-} operator which results +in copying the entire object. In earlier versions of \code{data.table}, this +was learned through the lesson of the (long-deprecated) function \code{key<-}, +the predecessor to \code{setkey}. The problem (for \code{data.table}) with the copy by \code{key<-} (other than -being slower) is that \R doesn't maintain the over allocated truelength, but it +being slower) is that \R doesn't maintain the over-allocated truelength, but it looks as though it has. Adding a column by reference using \code{:=} after a \code{key<-} was therefore a memory overwrite and eventually a segfault; the -over allocated memory wasn't really there after \code{key<-}'s copy. \code{data.table}s -now have an attribute \code{.internal.selfref} to catch and warn about such copies. +over-allocated memory wasn't really there after \code{key<-}'s copy. \code{data.table}s now have an attribute \code{.internal.selfref} to catch and warn about such copies. This attribute has been implemented in a way that is friendly with \code{identical()} and \code{object.size()}. -For the same reason, please use the other \code{set*} functions which modify -objects by reference, rather than using the \code{<-} operator which results -in copying the entire object. - It isn't good programming practice, in general, to use column numbers rather than names. This is why \code{setkey} and \code{setkeyv} only accept column names. If you use column numbers then bugs (possibly silent) can more easily creep into your code as time progresses if changes are made elsewhere in your code; e.g., if you add, remove or reorder columns in a few months time, a \code{setkey} by column number will then refer to a different column, possibly returning incorrect results -with no warning. 
(A similar concept exists in SQL, where \code{"select * from \dots"} -is considered poor programming style when a robust, maintainable system is +with no warning. (A similar concept exists in SQL, where \code{"select * from \dots"} is considered poor programming style when a robust, maintainable system is required.) If you really wish to use column numbers, it is possible but -deliberately a little harder; e.g., \code{setkeyv(DT,colnames(DT)[1:2])}. +deliberately a little harder; e.g., \code{setkeyv(DT,colnames(DT)[1:2])}. +If you want to use \code{\link[base]{grep}} to select key columns according to +a pattern, note that you can set \code{value = TRUE} to return a character vector of column names instead of the default integer indices. } \value{ The input is modified by reference, and returned (invisibly) so it can be used -in compound statements; e.g., \code{setkey(DT,a)[J("foo")]}. If you require a -copy, take a copy first (using \code{DT2=copy(DT)}). \code{copy()} may also +in compound statements; e.g., \code{setkey(DT,a)[.("foo")]}. If you require a +copy, take a copy first (using \code{DT2=copy(DT)}). \code{\link{copy}} may also sometimes be useful before \code{:=} is used to subassign to a column by -reference. See \code{?copy}. +reference. } \references{ \url{https://en.wikipedia.org/wiki/Radix_sort}\cr @@ -152,7 +123,7 @@ DT # after tables() # KEY column reports the key'd columns key(DT) keycols = c("A","B") -setkeyv(DT,keycols) # rather than key(DT)<-keycols (which copies entire table) +setkeyv(DT,keycols) DT = data.table(A=5:1,B=letters[5:1]) DT2 = DT # does not copy @@ -172,4 +143,3 @@ indices(DT) # get indices single vector indices(DT, vectors = TRUE) # get indices list } \keyword{ data } -