From 277e14a55a05843ddcea735c57d4b4e5a13b888e Mon Sep 17 00:00:00 2001
From: Michael Chirico <michael.chirico@grabtaxi.com>
Date: Thu, 8 Aug 2019 10:10:24 +0800
Subject: [PATCH] keep.rownames argument for transpose (#3715)

---
 NEWS.md               |   2 +
 R/transpose.R         |  29 ++++++++----
 inst/tests/tests.Rraw |  16 ++++++-
 man/transpose.Rd      |   5 +-
 src/transpose.c       | 108 +++++++++++++++++++++---------------------
 5 files changed, 94 insertions(+), 66 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 030207708..f6a44b54d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -157,6 +157,8 @@
     # data.table::fifelse(x, 7L, 11L)  0.4  0.4  0.5     5  # setDTthreads(4)
     ```
 
+22. `transpose` gains `keep.names=` and `make.names=` arguments, [#1886](https://github.com/Rdatatable/data.table/issues/1886). Previously, column names were dropped and there was no way to keep them. `keep.names="rn"` keeps the column names and puts them in the `"rn"` column of the result. Similarly, `make.names="rn"` uses column `"rn"` as the column names of the result. Both arguments are `NULL` by default for backwards compatibility. As these new arguments are new, they are subject to change in future according to community feedback. Thanks to @ghost for the request.
+
 #### BUG FIXES
 
 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting.
diff --git a/R/transpose.R b/R/transpose.R
index 5333dd788..2d275e0a8 100644
--- a/R/transpose.R
+++ b/R/transpose.R
@@ -1,12 +1,24 @@
-transpose = function(l, fill=NA, ignore.empty=FALSE) {
-  ans = .Call(Ctranspose, l, fill, ignore.empty)
-  if (is.data.table(l)) setDT(ans)
-  else if (is.data.frame(l)) {
-    if (is.null(names(ans)))
-      setattr(ans, "names", paste0("V", seq_along(ans)))
-    setattr(ans, "row.names", .set_row_names(length(ans[[1L]])))
-    setattr(ans, "class", "data.frame")
+transpose = function(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names=NULL) {
+  if (!is.null(make.names)) {
+    stopifnot(length(make.names)==1L)
+    if (is.character(make.names)) {
+      make.names=chmatch(make.names, names(l))
+      if (is.na(make.names))
+        stop("make.names='",make.names,"' not found in names of input")
+    } else {
+      make.names = as.integer(make.names)
+      if (is.na(make.names) || make.names<1L || make.names>length(l))
+        stop("make.names=",make.names," is out of range [1,ncol=",length(l),"]")
+    }
+    colnames = as.character(l[[make.names]])
+    l = if (is.data.table(l)) l[,-make.names,with=FALSE] else l[-make.names]
   }
+  ans = .Call(Ctranspose, l, fill, ignore.empty, keep.names)
+  if (!is.null(make.names)) setattr(ans, "names", c(keep.names, colnames))
+  else if (is.data.frame(l))  # including data.table but not plain list
+    setattr(ans, "names", c(keep.names, paste0("V", seq_len(length(ans)-length(keep.names)))))
+  if (is.data.table(l)) setDT(ans)
+  else if (is.data.frame(l)) setDF(ans)
   ans[]
 }
 
@@ -34,3 +46,4 @@ tstrsplit = function(x, ..., fill=NA, type.convert=FALSE, keep, names=FALSE) {
   setattr(ans, 'names', names)
   ans
 }
+
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index ddecbf312..2a7d5d65e 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -6696,7 +6696,7 @@ test(1477.07, transpose(strsplit(ll, ",", fixed=TRUE)), tstrsplit(ll, ",", fixed
 test(1477.08, transpose(1:5), error="l must be a list")
 test(1477.09, transpose(list(as.complex(c(1, 1+5i)))), error="Unsupported column type")
 test(1477.10, transpose(list(list(1:5))), error="Item 1 of list input is")
-test(1477.11, transpose(as.list(1:5), fill=1:2), error="fill must be NULL or length=1 vector")
+test(1477.11, transpose(as.list(1:5), fill=1:2), error="fill must be a length 1 vector")
 test(1477.12, transpose(as.list(1:5), ignore.empty=NA), error="ignore.empty should be logical TRUE/FALSE")
 test(1477.13, transpose(list()), list())
 
@@ -15509,6 +15509,20 @@ test(2072.066, fifelse(c(TRUE, TRUE, TRUE, FALSE, NA, FALSE), factor(letters[1:6
 test(2072.067, fifelse(c(TRUE, NA, TRUE, FALSE, FALSE, FALSE), factor(NA), factor(NA)),
                factor(c(NA,NA,NA,NA,NA,NA)))
 
+DT = data.table(x=1:5, y=6:10)
+test(2073.1, transpose(DT, keep.names="rn"),
+             ans<-data.table(rn=c('x','y'), V1=c(1L, 6L), V2=c(2L, 7L), V3=c(3L, 8L), V4=c(4L, 9L), V5=c(5L, 10L)))
+test(2073.2, transpose(DT, keep.names=TRUE), error="either NULL.*name of the first column of the result")
+test(2073.3, transpose(ans, make.names="rn"), DT)
+test(2073.4, transpose(ans, keep.names="rn", make.names="rn"), data.table(rn=paste0("V",1:5), x=1:5, y=6:10))
+L = list(a=1:3, rn=LETTERS[1:3], b=4:6)
+test(2073.5, transpose(L, make.names=0),  error="make.names=0 is out of range [1,ncol=3]")
+test(2073.6, transpose(L, make.names=4),  error="make.names=4 is out of range [1,ncol=3]")
+test(2073.7, transpose(L, make.names=NA), error="make.names=NA is out of range [1,ncol=3]")
+test(2073.8, transpose(L, make.names=2), list(A=INT(1,4), B=INT(2,5), C=INT(3,6)))
+test(2073.9, transpose(L, make.names=2, keep.names='foo'), list(foo=c("a","b"), A=INT(1,4), B=INT(2,5), C=INT(3,6)))
+
+
 ###################################
 #  Add new tests above this line  #
 ###################################
diff --git a/man/transpose.Rd b/man/transpose.Rd
index 9281ca122..547688fd6 100644
--- a/man/transpose.Rd
+++ b/man/transpose.Rd
@@ -6,12 +6,14 @@
 }
 
 \usage{
-transpose(l, fill=NA, ignore.empty=FALSE)
+transpose(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names=NULL)
 }
 \arguments{
   \item{l}{ A list, data.frame or data.table. }
   \item{fill}{ Default is \code{NA}. It is used to fill shorter list elements so as to return each element of the transposed result of equal lengths. }
   \item{ignore.empty}{Default is \code{FALSE}. \code{TRUE} will ignore length-0 list elements.}
+  \item{keep.names}{The name of the first column in the result containing the names of the input; e.g. \code{keep.names="rn"}. By default \code{NULL} and the names of the input are discarded.}
+  \item{make.names}{The name or number of a column in the input to use as names of the output; e.g. \code{make.names="rn"}. By default \code{NULL} and default names are given to the output columns.}
 }
 \details{
   The list elements (or columns of \code{data.frame}/\code{data.table}) should be all \code{atomic}. If list elements are of unequal lengths, the value provided in \code{fill} will be used so that the resulting list always has all elements of identical lengths. The class of input object is also preserved in the transposed result.
@@ -21,7 +23,6 @@ transpose(l, fill=NA, ignore.empty=FALSE)
   This is particularly useful in tasks that require splitting a character column and assigning each part to a separate column. This operation is quite common enough that a function \code{\link{tstrsplit}} is exported.
 
   \code{factor} columns are converted to \code{character} type. Attributes are not preserved at the moment. This may change in the future.
-
 }
 \value{
   A transposed \code{list}, \code{data.frame} or \code{data.table}.
diff --git a/src/transpose.c b/src/transpose.c
index 717e95b99..567a9a80c 100644
--- a/src/transpose.c
+++ b/src/transpose.c
@@ -2,99 +2,97 @@
 #include <Rdefines.h>
 #include <time.h>
 
-SEXP transpose(SEXP l, SEXP fill, SEXP ignoreArg) {
+SEXP transpose(SEXP l, SEXP fill, SEXP ignoreArg, SEXP keepNamesArg) {
 
+  int nprotect=0;
   if (!isNewList(l))
     error("l must be a list.");
   if (!length(l))
     return(duplicate(l));
-  if (!isLogical(ignoreArg) || LOGICAL(ignoreArg)[0] == NA_LOGICAL)
+  if (!isLogical(ignoreArg) || LOGICAL(ignoreArg)[0]==NA_LOGICAL)
     error("ignore.empty should be logical TRUE/FALSE.");
+  bool ignore = LOGICAL(ignoreArg)[0];
+  if (!(isNull(keepNamesArg) || (isString(keepNamesArg) && LENGTH(keepNamesArg)==1)))
+    error("keep.names should be either NULL, or the name of the first column of the result in which to place the names of the input");
+  bool rn = !isNull(keepNamesArg);
   if (length(fill) != 1)
-    error("fill must be NULL or length=1 vector.");
+    error("fill must be a length 1 vector, such as the default NA");
   R_len_t ln = LENGTH(l);
-  Rboolean ignore = LOGICAL(ignoreArg)[0];
 
   // preprocessing
-  R_len_t *len  = (R_len_t *)R_alloc(ln, sizeof(R_len_t));
   int maxlen=0, zerolen=0;
   SEXPTYPE maxtype=0;
   for (int i=0; i<ln; ++i) {
     SEXP li = VECTOR_ELT(l, i);
     if (!isVectorAtomic(li) && !isNull(li))
       error("Item %d of list input is not an atomic vector", i+1);
-    len[i] = length(li);
-    if (len[i] > maxlen)
-      maxlen = len[i];
-    zerolen += (len[i] == 0);
-    if (isFactor(li)) {
-      maxtype = STRSXP;
-    } else {
-      SEXPTYPE type = TYPEOF(li);
-      if (type > maxtype)
-        maxtype = type;
-    }
+    const int len = length(li);
+    if (len>maxlen) maxlen=len;
+    zerolen += (len==0);
+    SEXPTYPE type = TYPEOF(li);
+    if (isFactor(li)) type=STRSXP;
+    if (type>maxtype) maxtype=type;
   }
-  // coerce fill to maxtype
-  fill = PROTECT(coerceVector(fill, maxtype));
+  fill = PROTECT(coerceVector(fill, maxtype)); nprotect++;
 
-  // allocate 'ans'
-  SEXP ans = PROTECT(allocVector(VECSXP, maxlen));
-  int anslen = (!ignore) ? ln : (ln - zerolen);
+  SEXP ans = PROTECT(allocVector(VECSXP, maxlen+rn)); nprotect++;
+  int anslen = (ignore) ? (ln - zerolen) : ln;
+  if (rn) {
+    SEXP tt;
+    SET_VECTOR_ELT(ans, 0, tt=allocVector(STRSXP, anslen));
+    SEXP lNames = PROTECT(getAttrib(l, R_NamesSymbol)); nprotect++;
+    for (int i=0, j=0; i<ln; ++i) {
+      if (length(VECTOR_ELT(l, i))) SET_STRING_ELT(tt, j++, STRING_ELT(lNames, i));
+    }
+  }
   for (int i=0; i<maxlen; ++i) {
-    SET_VECTOR_ELT(ans, i, allocVector(maxtype, anslen));
+    SET_VECTOR_ELT(ans, i+rn, allocVector(maxtype, anslen));
   }
-
-  // transpose
-  int k=0;
-  for (int i=0; i<ln; ++i) {
-    if (ignore && !len[i]) continue;
+  const SEXP *ansp = VECTOR_PTR(ans);
+  for (int i=0, k=0; i<ln; ++i) {
     SEXP li = VECTOR_ELT(l, i);
+    const int len = length(li);
+    if (ignore && len==0) continue;
     bool coerce = false;
     if (TYPEOF(li) != maxtype) {
-      coerce = true;
       li = PROTECT(isFactor(li) ? asCharacterFactor(li) : coerceVector(li, maxtype));
+      coerce = true;
     }
-    switch (maxtype) { // TODO remove more macros
-    case INTSXP : {
-      const int *ili = INTEGER(li);
-      const int *ifill = INTEGER(fill);
-      for (int j=0; j<maxlen; ++j) {
-        SEXP thisi = VECTOR_ELT(ans, j);
-        INTEGER(thisi)[k] = (j < len[i]) ? ili[j] : ifill[0];
-      }
-    }
-      break;
+    switch (maxtype) {
     case LGLSXP : {
       const int *ili = LOGICAL(li);
-      const int *ifill = LOGICAL(fill);
+      const int ifill = LOGICAL(fill)[0];
       for (int j=0; j<maxlen; ++j) {
-        SEXP thisi = VECTOR_ELT(ans, j);
-        LOGICAL(thisi)[k] = (j < len[i]) ? ili[j] : ifill[0];
+        LOGICAL(ansp[j+rn])[k] = j<len ? ili[j] : ifill;
       }
-    }
-      break;
+    } break;
+    case INTSXP : {
+      const int *ili = INTEGER(li);
+      const int ifill = INTEGER(fill)[0];
+      for (int j=0; j<maxlen; ++j) {
+        INTEGER(ansp[j+rn])[k] = j<len ? ili[j] : ifill;
+      }
+    } break;
     case REALSXP : {
       const double *dli = REAL(li);
-      const double *dfill = REAL(fill);
+      const double dfill = REAL(fill)[0];
       for (int j=0; j<maxlen; ++j) {
-        SEXP thisi = VECTOR_ELT(ans, j);
-        REAL(thisi)[k] = (j < len[i]) ? dli[j] : dfill[0];
+        REAL(ansp[j+rn])[k] = j<len ? dli[j] : dfill;
       }
-    }
-      break;
-    case STRSXP :
+    } break;
+    case STRSXP : {
+      const SEXP sfill = STRING_ELT(fill, 0);
       for (int j=0; j<maxlen; ++j) {
-        SEXP thisi = VECTOR_ELT(ans, j);
-        SET_STRING_ELT(thisi, k, (j < len[i]) ? STRING_ELT(li, j) : STRING_ELT(fill, 0));
+        SET_STRING_ELT(ansp[j+rn], k, j<len ? STRING_ELT(li, j) : sfill);
       }
-      break;
+    } break;
     default :
-        error("Unsupported column type '%s'", type2char(maxtype));
+      error("Unsupported column type '%s'", type2char(maxtype));
     }
     if (coerce) UNPROTECT(1);
     k++;
   }
-  UNPROTECT(2);
-  return(ans);
+  UNPROTECT(nprotect);
+  return ans;
 }
+