From bbee709e6d6660e98401f2330eaf233d614c9ec4 Mon Sep 17 00:00:00 2001 From: Mauricio 'Pacha' Vargas Sepulveda Date: Wed, 1 Jan 2025 04:38:32 -0500 Subject: [PATCH 1/3] fix #406 --- cpp11test/DESCRIPTION | 2 +- inst/include/cpp11/r_vector.hpp | 4 ++- inst/include/cpp11/strings.hpp | 51 +++++++++++++++++++++++---------- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/cpp11test/DESCRIPTION b/cpp11test/DESCRIPTION index d1d05665..70c5649f 100644 --- a/cpp11test/DESCRIPTION +++ b/cpp11test/DESCRIPTION @@ -20,4 +20,4 @@ Suggests: xml2 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.1 +RoxygenNote: 7.3.2 diff --git a/inst/include/cpp11/r_vector.hpp b/inst/include/cpp11/r_vector.hpp index 576f4fe6..c27f54e9 100644 --- a/inst/include/cpp11/r_vector.hpp +++ b/inst/include/cpp11/r_vector.hpp @@ -235,7 +235,9 @@ class r_vector : public cpp11::r_vector { proxy at(const r_string& name) const; void push_back(T value); - /// Implemented in `strings.hpp` + template ::value>::type* = nullptr> + void push_back(const std::string& value); // Pacha: r_string only (#406) void push_back(const named_arg& value); void pop_back(); diff --git a/inst/include/cpp11/strings.hpp b/inst/include/cpp11/strings.hpp index 393f2a8c..cd279192 100644 --- a/inst/include/cpp11/strings.hpp +++ b/inst/include/cpp11/strings.hpp @@ -61,12 +61,32 @@ typedef r_vector strings; namespace writable { template <> -inline void r_vector::set_elt(SEXP x, R_xlen_t i, - typename r_vector::underlying_type value) { +inline void r_vector::set_elt( + SEXP x, R_xlen_t i, typename r_vector::underlying_type value) { // NOPROTECT: Likely too costly to unwind protect every set elt SET_STRING_ELT(x, i, value); } +// Pacha: Optimized subscript assignment for std::string +template <> +inline typename r_vector::proxy r_vector::operator[]( + R_xlen_t i) const { + return typename r_vector::proxy(data_, i, nullptr, false); +} + +// Pacha: Optimized push_back for std::string +template <> +template ::value>::type*> +inline void r_vector::push_back(const std::string& value) { + while (this->length_ >= this->capacity_) { + this->reserve(this->capacity_ == 0 ? 1 : this->capacity_ * 2); + } + + SEXP char_sexp = Rf_mkCharLenCE(value.c_str(), value.size(), CE_UTF8); + SET_STRING_ELT(this->data_, this->length_, char_sexp); + ++this->length_; +} + inline bool operator==(const r_vector::proxy& lhs, r_string rhs) { return static_cast(lhs).operator==(static_cast(rhs).c_str()); } @@ -95,17 +115,17 @@ inline SEXP alloc_if_charsxp(const SEXP data) { template <> inline r_vector::r_vector(const SEXP& data) - : cpp11::r_vector(alloc_or_copy(data)), capacity_(length_) { + : cpp11::r_vector(alloc_or_copy(data)), capacity_(this->length_) { if (detail::r_typeof(data) == CHARSXP) { - SET_STRING_ELT(data_, 0, data); + SET_STRING_ELT(this->data_, 0, data); } } template <> inline r_vector::r_vector(SEXP&& data) - : cpp11::r_vector(alloc_if_charsxp(data)), capacity_(length_) { + : cpp11::r_vector(alloc_if_charsxp(data)), capacity_(this->length_) { if (detail::r_typeof(data) == CHARSXP) { - SET_STRING_ELT(data_, 0, data); + SET_STRING_ELT(this->data_, 0, data); } } @@ -117,14 +137,15 @@ inline r_vector::r_vector(std::initializer_list il) unwind_protect([&] { auto it = il.begin(); - for (R_xlen_t i = 0; i < capacity_; ++i, ++it) { + for (R_xlen_t i = 0; i < this->capacity_; ++i, ++it) { // i.e. to `SEXP` - underlying_type elt = static_cast(*it); + typename r_vector::underlying_type elt = + static_cast::underlying_type>(*it); if (elt == NA_STRING) { - set_elt(data_, i, elt); + set_elt(this->data_, i, elt); } else { - set_elt(data_, i, Rf_mkCharCE(Rf_translateCharUTF8(elt), CE_UTF8)); + set_elt(this->data_, i, Rf_mkCharCE(Rf_translateCharUTF8(elt), CE_UTF8)); } } }); @@ -135,12 +156,12 @@ typedef r_vector strings; template inline void r_vector::push_back(const named_arg& value) { push_back(value.value()); - if (Rf_xlength(names()) == 0) { - cpp11::writable::strings new_nms(size()); - names() = new_nms; + if (Rf_xlength(this->names()) == 0) { + cpp11::writable::strings new_nms(this->size()); + this->names() = new_nms; } - cpp11::writable::strings nms(names()); - nms[size() - 1] = value.name(); + cpp11::writable::strings nms(this->names()); + nms[this->size() - 1] = value.name(); } } // namespace writable From 496dd6ba2ecfd212d8564d8cd46657971b79b0f3 Mon Sep 17 00:00:00 2001 From: Mauricio 'Pacha' Vargas Sepulveda Date: Wed, 1 Jan 2025 05:37:35 -0500 Subject: [PATCH 2/3] borrow from @traversc's push_back_fast --- inst/include/cpp11/strings.hpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/inst/include/cpp11/strings.hpp b/inst/include/cpp11/strings.hpp index cd279192..fdf84405 100644 --- a/inst/include/cpp11/strings.hpp +++ b/inst/include/cpp11/strings.hpp @@ -67,23 +67,15 @@ inline void r_vector::set_elt( SET_STRING_ELT(x, i, value); } -// Pacha: Optimized subscript assignment for std::string -template <> -inline typename r_vector::proxy r_vector::operator[]( - R_xlen_t i) const { - return typename r_vector::proxy(data_, i, nullptr, false); -} - -// Pacha: Optimized push_back for std::string +// Pacha: Optimized push_back for std::string (borrows from @traversc' push_back_fast) template <> template ::value>::type*> inline void r_vector::push_back(const std::string& value) { while (this->length_ >= this->capacity_) { this->reserve(this->capacity_ == 0 ? 1 : this->capacity_ * 2); } - - SEXP char_sexp = Rf_mkCharLenCE(value.c_str(), value.size(), CE_UTF8); - SET_STRING_ELT(this->data_, this->length_, char_sexp); + set_elt(this->data_, this->length_, + Rf_mkCharLenCE(value.c_str(), value.size(), CE_UTF8)); ++this->length_; } From 008ab7c199b51a03f3ea8d6be4aadb22cea76b48 Mon Sep 17 00:00:00 2001 From: Mauricio 'Pacha' Vargas Sepulveda Date: Wed, 1 Jan 2025 05:37:48 -0500 Subject: [PATCH 3/3] add benchmarks --- cpp11test/R/cpp11.R | 20 +++++++++ cpp11test/bench/strings.R | 35 +++++++++++++++ cpp11test/src/cpp11.cpp | 40 +++++++++++++++++ cpp11test/src/strings.cpp | 94 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+) create mode 100644 cpp11test/bench/strings.R diff --git a/cpp11test/R/cpp11.R b/cpp11test/R/cpp11.R index 038e7b76..da4588eb 100644 --- a/cpp11test/R/cpp11.R +++ b/cpp11test/R/cpp11.R @@ -168,6 +168,26 @@ string_push_back_ <- function() { .Call(`_cpp11test_string_push_back_`) } +grow_strings_cpp11_ <- function(n, seed) { + .Call(`_cpp11test_grow_strings_cpp11_`, n, seed) +} + +grow_strings_rcpp_ <- function(n, seed) { + .Call(`_cpp11test_grow_strings_rcpp_`, n, seed) +} + +grow_strings_manual_ <- function(n, seed) { + .Call(`_cpp11test_grow_strings_manual_`, n, seed) +} + +assign_cpp11_ <- function(n, seed) { + .Call(`_cpp11test_assign_cpp11_`, n, seed) +} + +assign_rcpp_ <- function(n, seed) { + .Call(`_cpp11test_assign_rcpp_`, n, seed) +} + sum_dbl_for_ <- function(x) { .Call(`_cpp11test_sum_dbl_for_`, x) } diff --git a/cpp11test/bench/strings.R b/cpp11test/bench/strings.R new file mode 100644 index 00000000..e497f68d --- /dev/null +++ b/cpp11test/bench/strings.R @@ -0,0 +1,35 @@ +pkgload::load_all("cpp11test") + +bench::press(len = as.integer(10^(0:6)), { + bench::mark( + assign_cpp11_(n = len, 123L), + assign_rcpp_(n = len, 123L), + iterations = 20 + ) +})[c("expression", "len", "min", "mem_alloc", "n_itr", "n_gc")] + +# Longer benchmark, lots of gc +len <- as.integer(10^7) +bench::mark( + cpp11 = cpp11_push_and_truncate_(len), + rcpp = rcpp_push_and_truncate_(len), + min_iterations = 200 +)[c("expression", "min", "mem_alloc", "n_itr", "n_gc")] + +bench::press(len = as.integer(10^(0:6)), { + bench::mark( + grow_strings_cpp11_(len, 123L), + grow_strings_rcpp_(len, 123L), + grow_strings_manual_(len, 123L), + iterations = 20 + ) +})[c("expression", "len", "min", "mem_alloc", "n_itr", "n_gc")] + +# Longer benchmark, lots of gc +len <- as.integer(10^7) +bench::mark( + cpp11 = cpp11_grow_strings_(len), + rcpp = rcpp_grow_strings_(len), + manual = manual_grow_strings_(len), + min_iterations = 200 +)[c("expression", "min", "mem_alloc", "n_itr", "n_gc")] diff --git a/cpp11test/src/cpp11.cpp b/cpp11test/src/cpp11.cpp index 421de637..df3a428e 100644 --- a/cpp11test/src/cpp11.cpp +++ b/cpp11test/src/cpp11.cpp @@ -324,6 +324,41 @@ extern "C" SEXP _cpp11test_string_push_back_() { return cpp11::as_sexp(string_push_back_()); END_CPP11 } +// strings.cpp +cpp11::strings grow_strings_cpp11_(size_t n, int seed); +extern "C" SEXP _cpp11test_grow_strings_cpp11_(SEXP n, SEXP seed) { + BEGIN_CPP11 + return cpp11::as_sexp(grow_strings_cpp11_(cpp11::as_cpp>(n), cpp11::as_cpp>(seed))); + END_CPP11 +} +// strings.cpp +Rcpp::CharacterVector grow_strings_rcpp_(size_t n, int seed); +extern "C" SEXP _cpp11test_grow_strings_rcpp_(SEXP n, SEXP seed) { + BEGIN_CPP11 + return cpp11::as_sexp(grow_strings_rcpp_(cpp11::as_cpp>(n), cpp11::as_cpp>(seed))); + END_CPP11 +} +// strings.cpp +SEXP grow_strings_manual_(size_t n, int seed); +extern "C" SEXP _cpp11test_grow_strings_manual_(SEXP n, SEXP seed) { + BEGIN_CPP11 + return cpp11::as_sexp(grow_strings_manual_(cpp11::as_cpp>(n), cpp11::as_cpp>(seed))); + END_CPP11 +} +// strings.cpp +cpp11::strings assign_cpp11_(size_t n, int seed); +extern "C" SEXP _cpp11test_assign_cpp11_(SEXP n, SEXP seed) { + BEGIN_CPP11 + return cpp11::as_sexp(assign_cpp11_(cpp11::as_cpp>(n), cpp11::as_cpp>(seed))); + END_CPP11 +} +// strings.cpp +Rcpp::CharacterVector assign_rcpp_(size_t n, int seed); +extern "C" SEXP _cpp11test_assign_rcpp_(SEXP n, SEXP seed) { + BEGIN_CPP11 + return cpp11::as_sexp(assign_rcpp_(cpp11::as_cpp>(n), cpp11::as_cpp>(seed))); + END_CPP11 +} // sum.cpp double sum_dbl_for_(cpp11::doubles x); extern "C" SEXP _cpp11test_sum_dbl_for_(SEXP x) { @@ -472,6 +507,8 @@ extern "C" { extern SEXP run_testthat_tests(SEXP); static const R_CallMethodDef CallEntries[] = { + {"_cpp11test_assign_cpp11_", (DL_FUNC) &_cpp11test_assign_cpp11_, 2}, + {"_cpp11test_assign_rcpp_", (DL_FUNC) &_cpp11test_assign_rcpp_, 2}, {"_cpp11test_col_sums", (DL_FUNC) &_cpp11test_col_sums, 1}, {"_cpp11test_cpp11_add_vec_for_", (DL_FUNC) &_cpp11test_cpp11_add_vec_for_, 2}, {"_cpp11test_cpp11_insert_", (DL_FUNC) &_cpp11test_cpp11_insert_, 1}, @@ -488,6 +525,9 @@ static const R_CallMethodDef CallEntries[] = { {"_cpp11test_gibbs_rcpp", (DL_FUNC) &_cpp11test_gibbs_rcpp, 2}, {"_cpp11test_gibbs_rcpp2", (DL_FUNC) &_cpp11test_gibbs_rcpp2, 2}, {"_cpp11test_grow_", (DL_FUNC) &_cpp11test_grow_, 1}, + {"_cpp11test_grow_strings_cpp11_", (DL_FUNC) &_cpp11test_grow_strings_cpp11_, 2}, + {"_cpp11test_grow_strings_manual_", (DL_FUNC) &_cpp11test_grow_strings_manual_, 2}, + {"_cpp11test_grow_strings_rcpp_", (DL_FUNC) &_cpp11test_grow_strings_rcpp_, 2}, {"_cpp11test_my_message", (DL_FUNC) &_cpp11test_my_message, 2}, {"_cpp11test_my_message_n1", (DL_FUNC) &_cpp11test_my_message_n1, 1}, {"_cpp11test_my_message_n1fmt", (DL_FUNC) &_cpp11test_my_message_n1fmt, 1}, diff --git a/cpp11test/src/strings.cpp b/cpp11test/src/strings.cpp index 1db7736b..e19188dc 100644 --- a/cpp11test/src/strings.cpp +++ b/cpp11test/src/strings.cpp @@ -1,4 +1,8 @@ #include "cpp11/strings.hpp" +#include +#include + +#include // Test benchmark for string proxy assignment performance. // We don't unwind_protect() before each `SET_STRING_ELT()` call, @@ -33,3 +37,93 @@ return x; } + +// issue 406 + +std::random_device rd; +std::mt19937 gen(rd()); + +double random_double() { + std::uniform_real_distribution dist(0.0, 1.0); + return dist(gen); +} + +int random_int(int min, int max) { + std::uniform_int_distribution dist(min, max); + return dist(gen); +} + +std::string random_string() { + std::string s(10, '\0'); + for (size_t i = 0; i < 10; i++) { + s[i] = random_int(0, 25) + 'a'; + } + return s; +} + +[[cpp11::register]] cpp11::strings grow_strings_cpp11_(size_t n, int seed) { + gen.seed(seed); + cpp11::writable::strings x; + for (size_t i = 0; i < n; ++i) { + x.push_back(random_string()); + } + return x; +} + +[[cpp11::register]] Rcpp::CharacterVector grow_strings_rcpp_(size_t n, int seed) { + gen.seed(seed); + Rcpp::CharacterVector x(n); + for (size_t i = 0; i < n; ++i) { + x[i] = random_string(); + } + return x; +} + +[[cpp11::register]] SEXP grow_strings_manual_(size_t n, int seed) { + gen.seed(seed); + SEXP data_ = PROTECT(Rf_allocVector(STRSXP, 0)); + size_t size_ = 0; + size_t capacity_ = 0; + for (size_t i = 0; i < n; ++i) { + if (size_ == capacity_) { + capacity_ = capacity_ == 0 ? 1 : capacity_ * 2; + SEXP new_data_ = PROTECT(Rf_allocVector(STRSXP, capacity_)); + for (size_t j = 0; j < size_; ++j) { + SET_STRING_ELT(new_data_, j, STRING_ELT(data_, j)); + } + UNPROTECT(2); + data_ = PROTECT(new_data_); + } + SET_STRING_ELT(data_, size_++, Rf_mkChar(random_string().c_str())); + } + // copy back down to size + if (size_ < capacity_) { + SEXP new_data_ = PROTECT(Rf_allocVector(STRSXP, size_)); + for (size_t j = 0; j < size_; ++j) { + SET_STRING_ELT(new_data_, j, STRING_ELT(data_, j)); + } + UNPROTECT(2); + return new_data_; + } else { + UNPROTECT(1); + return data_; + } +} + +[[cpp11::register]] cpp11::strings assign_cpp11_(size_t n, int seed) { + gen.seed(seed); + cpp11::writable::strings x(n); + for (size_t i = 0; i < n; ++i) { + x[i] = random_string(); + } + return x; +} + +[[cpp11::register]] Rcpp::CharacterVector assign_rcpp_(size_t n, int seed) { + gen.seed(seed); + Rcpp::CharacterVector x(n); + for (size_t i = 0; i < n; ++i) { + x[i] = random_string(); + } + return x; +}