Skip to content

Commit

Permalink
ALTREP implementation of growable_*
Browse files Browse the repository at this point in the history
Currently broken:
- 1 errors out of 11637
 - won't work with expression vectors at all
  • Loading branch information
aitap committed Dec 29, 2024
1 parent 7b9b141 commit 61fdc06
Show file tree
Hide file tree
Showing 8 changed files with 280 additions and 10 deletions.
4 changes: 2 additions & 2 deletions src/assign.c
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ SEXP shallowwrapper(SEXP dt, SEXP cols) {
}

SEXP truelength(SEXP x) {
return ScalarInteger(isNull(x) ? 0 : growable_max_size(x));
return ScalarInteger(is_growable(x) ? growable_max_size(x) : 0);
}

SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
Expand Down Expand Up @@ -520,7 +520,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
// modify DT by reference. Other than if new columns are being added and the allocVec() fails with
// out-of-memory. In that case the user will receive hard halt and know to rerun.
if (length(newcolnames)) {
oldtncol = growable_max_size(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more.
oldtncol = is_growable(dt) ? growable_max_size(dt) : 0; // TO DO: oldtncol can be just called tl now, as we won't realloc here any more.

if (oldtncol<oldncol) {
if (oldtncol==0) error(_("This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed manually (e.g. using structure()). Please run setDT() or setalloccol() on it first (to pre-allocate space for new columns) before assigning by reference to it.")); // #2996
Expand Down
6 changes: 6 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
#if R_VERSION < R_Version(3, 4, 0)
# define SET_GROWABLE_BIT(x) // #3292
#endif
#if R_VERSION >= R_Version(4, 3, 0)
# define USE_GROWABLE_ALTREP
#endif
#include <Rinternals.h>
#define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT
#include <stdint.h> // for uint64_t rather than unsigned long long
Expand Down Expand Up @@ -309,6 +312,9 @@ void growable_resize(SEXP x, R_xlen_t newsize);
Rboolean is_growable(SEXP x);
// Transform x into a growable vector. The return value must be reprotected in place of x. What happens to x is deliberately not specified, but no copying occurs.
SEXP make_growable(SEXP x);
#if R_VERSION >= R_Version(4, 3, 0)
void register_altrep_classes(DllInfo*);
#endif

// functions called from R level .Call/.External and registered in init.c
// these now live here to pass -Wstrict-prototypes, #5477
Expand Down
5 changes: 2 additions & 3 deletions src/dogroups.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,10 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) {
// (see data.table.h), and isNewList() is true for NULL
if (n==0)
return false;
if (hash_lookup(specials, x, 0)<0) return true; // test 2158
if (isVectorAtomic(x))
return ALTREP(x) || hash_lookup(specials, x, 0)<0;
return ALTREP(x); // see test 2156: ALTREP is a source of sharing we can't trace reliably
if (isNewList(x)) {
if (hash_lookup(specials, x, 0)<0)
return true; // test 2158
for (int i=0; i<n; ++i) {
list_el = VECTOR_ELT(x,i);
if (anySpecialStatic(list_el, specials))
Expand Down
238 changes: 237 additions & 1 deletion src/growable.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "data.table.h"

#ifndef USE_GROWABLE_ALTREP

SEXP growable_allocate(SEXPTYPE type, R_xlen_t size, R_xlen_t max_size) {
SEXP ret = PROTECT(allocVector(type, max_size));
SET_TRUELENGTH(ret, max_size);
Expand Down Expand Up @@ -32,9 +34,243 @@ Rboolean is_growable(SEXP x) {
;
}

// Assuming no ALTREP for now
// Assuming no ALTREP columns
// Non-ALTREP implementation: flags x itself as growable and returns the very same object.
SEXP make_growable(SEXP x) {
  const R_xlen_t len = XLENGTH(x);
  // Raise the recorded true length to at least the current length before setting the growable bit.
  if (TRUELENGTH(x) < len) {
    SET_TRUELENGTH(x, len);
  }
  SET_GROWABLE_BIT(x);
  return x;
}

#else

#include <R_ext/Altrep.h>

static R_altrep_class_t dta_grow_string, dta_grow_integer, dta_grow_logical, dta_grow_real, dta_grow_complex, dta_grow_raw, dta_grow_list;
static Rcomplex NA_COMPLEX = { 0, };

/*
ALTREP class layout:
data1 = underlying vector
data2 = its current length stored as a length-1 REALSXP
Unless we implement an Unserialize method, this can be changed any time.
Classes that have been released on CRAN with a Serialized_state/Unserialize pair will have to stay as they have been defined in order to keep *.rds files readable.
*/

// Length method shared by every growable class: the current length is kept in data2 as a length-1 REALSXP.
static R_xlen_t altall_Length(SEXP x) {
  const double *current = REAL(R_altrep_data2(x));
  return (R_xlen_t)current[0];
}

// Generates alt<cls>_Inspect for one growable class. The method prints a short tag containing the
// length of the wrapped vector (data1), labelled "truelength", and returns FALSE.
// All arguments other than x are ignored.
// NOTE(review): the printed tag reads "growable<cls>_v0" whereas the classes are registered as
// "growable_<cls>_v0" - confirm whether the missing underscore is intentional.
#define make_inspect_method(cls)                                          \
  static Rboolean alt##cls##_Inspect(                                     \
    SEXP x, int pre, int deep, int pvec,                                  \
    void (*inspect_subtree)(SEXP x, int pre, int deep, int pvec)          \
  ) {                                                                     \
    (void)pre; (void)deep; (void)pvec; (void)inspect_subtree;             \
    const double capacity = (double)XLENGTH(R_altrep_data1(x));           \
    Rprintf("data.table::growable" #cls "_v0(truelength=%g) ", capacity); \
    return FALSE;                                                         \
  }
make_inspect_method(string)
make_inspect_method(integer)
make_inspect_method(logical)
make_inspect_method(real)
make_inspect_method(complex)
make_inspect_method(raw)
make_inspect_method(list)
#undef make_inspect_method

// Generates alt<cls>_Dataptr for one growable class: the data pointer of the wrapper is simply the
// data pointer of the wrapped vector (data1), obtained through the given accessor.
// The writable flag is ignored.
#define make_dataptr_method(cls, getptr)                           \
  static void * alt##cls##_Dataptr(SEXP x, Rboolean writable) {    \
    (void)writable;                                                \
    SEXP payload = R_altrep_data1(x);                              \
    return (void*)getptr(payload);                                 \
  }
make_dataptr_method(string, STRING_PTR_RO)
make_dataptr_method(integer, INTEGER)
make_dataptr_method(logical, LOGICAL)
make_dataptr_method(real, REAL)
make_dataptr_method(complex, COMPLEX)
make_dataptr_method(raw, RAW)
make_dataptr_method(list, DATAPTR_RO) // VECTOR_PTR_RO to appear in R-4.5
#undef make_dataptr_method

// Dataptr_or_null method shared by every growable class.
// Asks the wrapped vector (data1) for its data pointer instead of calling DATAPTR_RO() on the
// wrapper itself, so the request is not routed back through this class's own Dataptr method.
// May return NULL when the wrapped vector cannot provide a pointer.
static const void * altall_Dataptr_or_null(SEXP x) {
  return DATAPTR_OR_NULL(R_altrep_data1(x));
}

// lots of boilerplate, but R calling *_ELT one by one would be far too slow
// Generates alt<cls>_Extract_subset for one atomic growable class.
// indx is coerced to REALSXP and treated as 1-based positions: a position within [1, XLENGTH(x)]
// copies the corresponding element, anything else yields na_value. The call argument is ignored.
// The result is a freshly allocated vector of the same type as x and the same length as indx.
#define make_extract_subset_method(cls, type, accessor, na_value)           \
  static SEXP alt##cls##_Extract_subset(SEXP x, SEXP indx, SEXP call) {     \
    (void)call;                                                             \
    SEXP positions = PROTECT(coerceVector(indx, REALSXP));                  \
    const double *pos = REAL(positions);                                    \
    const R_xlen_t nout = XLENGTH(positions), nsrc = XLENGTH(x);            \
    SEXP out = PROTECT(allocVector(TYPEOF(x), nout));                       \
    type *dst = accessor(out);                                              \
    const type *src = accessor(x);                                          \
    for (R_xlen_t k = 0; k < nout; ++k) {                                   \
      const double p = pos[k];                                              \
      if (p >= 1 && p <= nsrc) dst[k] = src[(R_xlen_t)p - 1];               \
      else dst[k] = na_value;                                               \
    }                                                                       \
    UNPROTECT(2);                                                           \
    return out;                                                             \
  }
make_extract_subset_method(integer, int, INTEGER, NA_INTEGER)
make_extract_subset_method(logical, int, LOGICAL, NA_LOGICAL)
make_extract_subset_method(real, double, REAL, NA_REAL)
make_extract_subset_method(complex, Rcomplex, COMPLEX, NA_COMPLEX)
make_extract_subset_method(raw, Rbyte, RAW, 0)
// not implementing the string and list methods because those do require the write barrier and are thus no better than calling *_ELT one by one
#undef make_extract_subset_method

// Generates alt<cls>_Elt for the string and list classes: element i of the wrapper is element i of
// the wrapped vector (data1), read through the given getter.
#define make_elt_method(cls, getter)                      \
  static SEXP alt##cls##_Elt(SEXP x, R_xlen_t i) {        \
    SEXP payload = R_altrep_data1(x);                     \
    return getter(payload, i);                            \
  }
make_elt_method(string, STRING_ELT)
make_elt_method(list, VECTOR_ELT)
#undef make_elt_method

// Generates alt<cls>_Set_elt for the string and list classes: storing v at position i of the
// wrapper stores it at position i of the wrapped vector (data1) through the given setter.
#define make_set_elt_method(cls, setter)                          \
  static void alt##cls##_Set_elt(SEXP x, R_xlen_t i, SEXP v) {    \
    SEXP payload = R_altrep_data1(x);                             \
    setter(payload, i, v);                                        \
  }
make_set_elt_method(string, SET_STRING_ELT)
make_set_elt_method(list, SET_VECTOR_ELT)
#undef make_set_elt_method

// liked the Extract_subset methods? say hello to Get_region
// Generates alt<cls>_Get_region for one atomic growable class.
// Copies up to n elements of x, starting at 0-based offset i, into buf and returns the number of
// elements actually copied; copying stops early at XLENGTH(x).
#define make_get_region_method(cls, type, accessor)           \
  static R_xlen_t alt##cls##_Get_region(                      \
    SEXP x, R_xlen_t i, R_xlen_t n, type * buf                \
  ) {                                                         \
    const R_xlen_t len = XLENGTH(x);                          \
    const type *src = accessor(x);                            \
    R_xlen_t ncopied = 0;                                     \
    while (ncopied < n && i + ncopied < len) {                \
      buf[ncopied] = src[i + ncopied];                        \
      ++ncopied;                                              \
    }                                                         \
    return ncopied;                                           \
  }
make_get_region_method(integer, int, INTEGER)
make_get_region_method(logical, int, LOGICAL)
make_get_region_method(real, double, REAL)
make_get_region_method(complex, Rcomplex, COMPLEX)
make_get_region_method(raw, Rbyte, RAW)
#undef make_get_region_method

// Creates the seven growable ALTREP classes ("growable_<type>_v0" in package "data.table") and
// installs their methods. Every class gets the shared Length and Dataptr_or_null methods plus its
// own Inspect and Dataptr methods; string and list classes additionally get Elt/Set_elt, while the
// atomic classes get Extract_subset and Get_region.
void register_altrep_classes(DllInfo * info) {
  // NA_COMPLEX is zero-initialised statically and filled in here;
  // it is the NA value handed to the complex Extract_subset method.
  NA_COMPLEX = (Rcomplex){ .r = NA_REAL, .i = NA_REAL };

  // Class creation plus the four methods common to all growable classes.
  // cls is only ever used with # and ##, so it is never macro-expanded itself.
#define REGISTER_GROWABLE(cls) do {                                                       \
    dta_grow_##cls = R_make_alt##cls##_class("growable_" #cls "_v0", "data.table", info); \
    R_set_altrep_Length_method(dta_grow_##cls, altall_Length);                            \
    R_set_altrep_Inspect_method(dta_grow_##cls, alt##cls##_Inspect);                      \
    R_set_altvec_Dataptr_method(dta_grow_##cls, alt##cls##_Dataptr);                      \
    R_set_altvec_Dataptr_or_null_method(dta_grow_##cls, altall_Dataptr_or_null);          \
  } while (0)

  REGISTER_GROWABLE(string);
  R_set_altstring_Elt_method(dta_grow_string, altstring_Elt);
  R_set_altstring_Set_elt_method(dta_grow_string, altstring_Set_elt);

  REGISTER_GROWABLE(integer);
  R_set_altvec_Extract_subset_method(dta_grow_integer, altinteger_Extract_subset);
  R_set_altinteger_Get_region_method(dta_grow_integer, altinteger_Get_region);

  REGISTER_GROWABLE(logical);
  R_set_altvec_Extract_subset_method(dta_grow_logical, altlogical_Extract_subset);
  R_set_altlogical_Get_region_method(dta_grow_logical, altlogical_Get_region);

  REGISTER_GROWABLE(real);
  R_set_altvec_Extract_subset_method(dta_grow_real, altreal_Extract_subset);
  R_set_altreal_Get_region_method(dta_grow_real, altreal_Get_region);

  REGISTER_GROWABLE(complex);
  R_set_altvec_Extract_subset_method(dta_grow_complex, altcomplex_Extract_subset);
  R_set_altcomplex_Get_region_method(dta_grow_complex, altcomplex_Get_region);

  REGISTER_GROWABLE(raw);
  R_set_altvec_Extract_subset_method(dta_grow_raw, altraw_Extract_subset);
  R_set_altraw_Get_region_method(dta_grow_raw, altraw_Get_region);

  REGISTER_GROWABLE(list);
  R_set_altlist_Elt_method(dta_grow_list, altlist_Elt);
  R_set_altlist_Set_elt_method(dta_grow_list, altlist_Set_elt);

#undef REGISTER_GROWABLE
}

// Maps a vector type to the growable ALTREP class used to wrap it.
// The dta_grow_* class handles are the file-scope statics declared near the top of this ALTREP
// section; the verbatim re-declaration that used to precede this function was redundant.
// Lists and expression vectors share the list class. Any other type raises an internal error.
// NOTE(review): the default branch has no return statement, so this relies on internal_error()
// never returning - confirm it is declared noreturn.
static R_altrep_class_t type2class(SEXPTYPE type) {
  switch(type) {
  case STRSXP:  return dta_grow_string;
  case INTSXP:  return dta_grow_integer;
  case LGLSXP:  return dta_grow_logical;
  case REALSXP: return dta_grow_real;
  case CPLXSXP: return dta_grow_complex;
  case RAWSXP:  return dta_grow_raw;
  case VECSXP:
  case EXPRSXP: return dta_grow_list;
  default:
    internal_error(__func__, "Can't create a growable vector of type '%s'", type2char(type));
  }
}

// Allocates a growable vector of the given type.
//   type     - vector type; must be one that type2class() accepts
//   size     - initial length reported for the vector
//   max_size - number of elements actually allocated in the backing vector (data1)
// Returns an unprotected ALTREP object whose data1 is the backing vector and whose data2 holds
// size as a length-1 REALSXP. Raises an internal error if size is not within [0, max_size],
// because a reported length beyond the backing storage would let readers run past its end.
SEXP growable_allocate(SEXPTYPE type, R_xlen_t size, R_xlen_t max_size) {
  if (size < 0 || size > max_size) internal_error(
    __func__, "size=%g must be between 0 and max_size=%g",
    (double)size, (double)max_size
  );
  // ret stays protected while the two payload allocations below may trigger garbage collection.
  SEXP ret = PROTECT(R_new_altrep(type2class(type), R_NilValue, R_NilValue));
  R_set_altrep_data1(ret, allocVector(type, max_size));
  R_set_altrep_data2(ret, ScalarReal((double)size));
  UNPROTECT(1);
  return ret;
}

// Capacity of a growable vector: the length of the wrapped backing vector stored in data1.
R_xlen_t growable_max_size(SEXP x) {
  SEXP backing = R_altrep_data1(x);
  return XLENGTH(backing);
}

// Changes the length reported by a growable vector without touching its backing storage.
//   x       - a growable ALTREP vector
//   newsize - the new length; must lie within [0, growable_max_size(x)]
// Raises an internal error when newsize is out of that range; otherwise overwrites the length
// stored in data2.
void growable_resize(SEXP x, R_xlen_t newsize) {
  const R_xlen_t max_size = growable_max_size(x);
  if (newsize > max_size) internal_error(
    __func__, "newsize=%g > max_size=%g",
    (double)newsize, (double)max_size
  );
  // A negative length would be reported verbatim by the Length method, so reject it as well.
  if (newsize < 0) internal_error(
    __func__, "newsize=%g < 0",
    (double)newsize
  );
  REAL(R_altrep_data2(x))[0] = (double)newsize;
}

// Tells whether x is an instance of the growable ALTREP class matching its own type.
// Only string, integer, logical, real, complex, raw and list vectors are considered;
// every other type yields FALSE without consulting type2class().
Rboolean is_growable(SEXP x) {
  const SEXPTYPE type = TYPEOF(x);
  const int supported =
    type == STRSXP || type == INTSXP || type == LGLSXP || type == REALSXP ||
    type == CPLXSXP || type == RAWSXP || type == VECSXP;
  if (!supported) {
    return FALSE;
  }
  return R_altrep_inherits(x, type2class(type));
}

// Turns x into a growable vector without copying its data.
// If x is already growable it is returned unchanged: wrapping it a second time would make
// growable_max_size() report the inner vector's current length and so lose its spare capacity.
// Otherwise returns a new ALTREP wrapper (unprotected) whose data1 is x, whose data2 holds
// XLENGTH(x) as a length-1 REALSXP, and which carries a shallow copy of x's attributes.
// x is not protected here. NOTE(review): presumably the caller already protects it - confirm.
SEXP make_growable(SEXP x) {
  if (is_growable(x)) return x;
  SEXP ret = PROTECT(R_new_altrep(type2class(TYPEOF(x)), R_NilValue, R_NilValue));
  R_set_altrep_data1(ret, x);
  R_set_altrep_data2(ret, ScalarReal((double)XLENGTH(x)));
  SHALLOW_DUPLICATE_ATTRIB(ret, x);
  UNPROTECT(1);
  return ret;
}

#endif
4 changes: 4 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,12 @@ void attribute_visible R_init_data_table(DllInfo *info)

SEXP tmp = PROTECT(allocVector(INTSXP,2));
if (LENGTH(tmp)!=2) error(_("Checking LENGTH(allocVector(INTSXP,2)) [%d] is 2 %s"), LENGTH(tmp), msg);
#if R_VERSION >= R_Version(4, 3, 0)
register_altrep_classes(info);
#else
// Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768
if (TRUELENGTH(tmp)!=0) error(_("Checking TRUELENGTH(allocVector(INTSXP,2)) [%lld] is 0 %s"), (long long)TRUELENGTH(tmp), msg);
#endif
UNPROTECT(1);

// According to IEEE (http://en.wikipedia.org/wiki/IEEE_754-1985#Zero) we can rely on 0.0 being all 0 bits.
Expand Down
6 changes: 6 additions & 0 deletions src/reorder.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,17 @@ SEXP reorder(SEXP x, SEXP order)
error(_("Column %d is length %d which differs from length of column 1 (%d). Invalid data.table."), i+1, length(v), nrow);
if (SIZEOF(v) > maxSize)
maxSize=SIZEOF(v);
#ifndef USE_GROWABLE_ALTREP
if (ALTREP(v)) SET_VECTOR_ELT(x, i, copyAsPlain(v));
#endif
}
copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768
} else {
if (SIZEOF(x)!=4 && SIZEOF(x)!=8 && SIZEOF(x)!=16 && SIZEOF(x)!=1)
error(_("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%zu)"), type2char(TYPEOF(x)), SIZEOF(x));
#ifndef USE_GROWABLE_ALTREP
if (ALTREP(x)) internal_error(__func__, "cannot reorder an ALTREP vector. Please see NEWS item 2 in v1.11.4"); // # nocov
#endif
maxSize = SIZEOF(x);
nrow = length(x);
ncol = 1;
Expand All @@ -40,7 +44,9 @@ SEXP reorder(SEXP x, SEXP order)
if (length(order) != nrow)
error("nrow(x)[%d]!=length(order)[%d]", nrow, length(order)); // # notranslate
int nprotect = 0;
#ifndef USE_GROWABLE_ALTREP
if (ALTREP(order)) { order=PROTECT(copyAsPlain(order)); nprotect++; } // TODO: if it's an ALTREP sequence some optimizations are possible rather than expand
#endif

const int *restrict idx = INTEGER(order);
int i=0;
Expand Down
11 changes: 10 additions & 1 deletion src/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ inline bool INHERITS(SEXP x, SEXP char_) {
return false;
}

#ifdef USE_GROWABLE_ALTREP
// Variant used when USE_GROWABLE_ALTREP is defined: copying is delegated entirely to duplicate().
SEXP copyAsPlain(SEXP x) {
  SEXP copy = duplicate(x);
  return copy;
}
#else
SEXP copyAsPlain(SEXP x) {
// v1.12.2 and before used standard R duplicate() to do this. But duplicate() is not guaranteed to not return an ALTREP.
// e.g. ALTREP 'wrapper' on factor column (with materialized INTSXP) in package VIM under example(hotdeck)
Expand Down Expand Up @@ -256,6 +259,7 @@ SEXP copyAsPlain(SEXP x) {
UNPROTECT(1);
return ans;
}
#endif

void copySharedColumns(SEXP x) {
const int ncol = length(x);
Expand All @@ -266,7 +270,12 @@ void copySharedColumns(SEXP x) {
int nShared=0;
for (int i=0; i<ncol; ++i) {
SEXP thiscol = xp[i];
if (ALTREP(thiscol) || hash_lookup(marks, thiscol, 0)<0) {
if (
hash_lookup(marks, thiscol, 0)<0
#ifndef USE_GROWABLE_ALTREP
|| ALTREP(thiscol)
#endif
) {
shared[i] = true; // we mark ALTREP as 'shared' too, whereas 'tocopy' would be better word to use for ALTREP
nShared++;
} else {
Expand Down
Loading

0 comments on commit 61fdc06

Please sign in to comment.