-library(keras)
-install_keras()
+library(keras3)
+reticulate::install_python()
+install_keras()
diff --git a/.nojekyll b/.nojekyll
index cadad4d3..09174f77 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-09c6edb9
\ No newline at end of file
+9c9b93e0
\ No newline at end of file
diff --git a/html/images/logo-sparklyr.png b/html/images/logo-sparklyr.png
index bf7dd9ce..52fcbc75 100644
Binary files a/html/images/logo-sparklyr.png and b/html/images/logo-sparklyr.png differ
diff --git a/html/images/logo-tensorflow.png b/html/images/logo-tensorflow.png
index ddb17bcb..c0ecf986 100644
Binary files a/html/images/logo-tensorflow.png and b/html/images/logo-tensorflow.png differ
diff --git a/html/keras.html b/html/keras.html
index d4cfdbf1..5626d748 100644
--- a/html/keras.html
+++ b/html/keras.html
@@ -162,30 +162,30 @@
Keras is a high-level neural networks API developed with a focus on enabling fast experimentation. It supports multiple back-ends, including TensorFlow, CNTK and Theano.
-TensorFlow is a lower level mathematical library for building deep neural network architectures. The keras R package makes it easy to use Keras and TensorFlow in R.
+Keras is a high-level neural networks API developed with a focus on enabling fast experimentation. It supports multiple back-ends, including TensorFlow, Jax and Torch.
+Backends like TensorFlow are lower level mathematical libraries for building deep neural network architectures. The keras3 R package makes it easy to use Keras with any backend in R.
Read more at:
-https://tensorflow.rstudio.com
+https://keras.posit.co
https://www.manning.com/books/deep-learning-with-r-second-edition
The keras R package uses the Python keras library. You can install all the prerequisites directly from R: https://tensorflow.rstudio.com/install.
+The keras3 R package uses the Python keras library. You can install all the prerequisites directly from R. See ?keras3::install_keras for details and options.
-library(keras)
-install_keras()
+library(keras3)
+reticulate::install_python()
+install_keras()
See ?install_keras
for GPU instructions.
This installs the required libraries in an Anaconda environment or virtual environment r-tensorflow.
This installs the required libraries in a virtual environment named ‘r-keras’. It will automatically detect if a GPU is available.
# input layer: use MNIST images
mnist <- dataset_mnist()
-x_train <- mnist$train$x
-y_train <- mnist$train$y
-x_test <- mnist$test$x
-y_test <- mnist$test$y
-# reshape and rescale
-x_train <- array_reshape(x_train, c(nrow(x_train), 784))
-x_test <- array_reshape(x_test, c(nrow(x_test), 784))
-x_train <- x_train / 255
-x_test <- x_test / 255
+x_train <- mnist$train$x; y_train <- mnist$train$y
+x_test <- mnist$test$x; y_test <- mnist$test$y
+# reshape and rescale
+x_train <- array_reshape(x_train, c(nrow(x_train), 784))
+x_test <- array_reshape(x_test, c(nrow(x_test), 784))
+x_train <- x_train / 255; x_test <- x_test / 255
+y_train <- to_categorical(y_train, 10)
+y_test <- to_categorical(y_test, 10)
-y_train <- to_categorical(y_train, 10)
-y_test <- to_categorical(y_test, 10)
-# defining the model and layers
-model <- keras_model_sequential()
-model %>%
-  layer_dense(units = 256, activation = 'relu', input_shape = c(784)) %>%
-  layer_dropout(rate = 0.4) %>%
-  layer_dense(units = 128, activation = 'relu') %>%
-  layer_dense(units = 10, activation = 'softmax')
-
-# compile (define loss and optimizer)
-model %>%
-  compile(
-    loss = 'categorical_crossentropy',
-    optimizer = optimizer_rmsprop(),
-    metrics = c('accuracy')
-  )
+# defining the model and layers
+model <- keras_model_sequential(input_shape = c(28, 28, 1))
+model |>
+  layer_conv_2d(filters = 32, kernel_size = c(3, 3),
+                activation = "relu") |>
+  layer_max_pooling_2d(pool_size = c(2, 2)) |>
+  layer_conv_2d(filters = 64, kernel_size = c(3, 3),
+                activation = "relu") |>
+  layer_max_pooling_2d(pool_size = c(2, 2)) |>
+  layer_flatten() |>
+  layer_dropout(rate = 0.5) |>
+  layer_dense(units = num_classes,
+              activation = "softmax")
+
+# View the model summary
+summary(model)
+plot(model)
-# train (fit)
-model %>% fit(
-  x_train, y_train,
-  epochs = 30, batch_size = 128,
-  validation_split = 0.2
-)
-model %>% evaluate(x_test, y_test)
-model %>% predict_classes(x_test)
keras_model()
: Keras Model.
keras_model_sequential()
: Keras Model composed of a linear stack of layers.
multi_gpu_model()
: Replicates a model on different GPUs.
keras_input()
and keras_model()
Define a Functional Model with inputs and outputs.
+inputs <- keras_input(<input-shape>)
+outputs <- inputs |>
+  layer_dense() |> layer_...
+model <- keras_model(inputs, outputs)
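For illustration, the same pattern with a concrete input shape and layer sizes (the values are arbitrary):
inputs <- keras_input(shape = c(784))
outputs <- inputs |>
  layer_dense(units = 64, activation = "relu") |>
  layer_dense(units = 10, activation = "softmax")
model <- keras_model(inputs, outputs)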
keras_model_sequential()
Define a Sequential Model composed of a linear stack of layers
+model <- keras_model_sequential(<input-shape>) |>
+  layer_dense() |> layer_...
Model()
Subclass the base Model class
+compile(object, optimizer, loss, metrics = NULL)
: Configure a Keras model for training.
fit(object, x = NULL, y = NULL, batch_size = NULL, epochs = 10, verbose = 1, callbacks = NULL, ...)
: Train a Keras model for a fixed number of epochs (iterations)
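For example, a typical compile-then-fit call, reusing the MNIST arrays prepared above (hyperparameters are illustrative):
model |> compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_rmsprop(),
  metrics = "accuracy"
)
history <- model |> fit(
  x_train, y_train,
  epochs = 10, batch_size = 128,
  validation_split = 0.2
)
plot(history)   # plot training/validation metrics per epoch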
Customize training:
Pass custom callbacks to fit(): see Callback().
Call train_on_batch() in a custom training loop.
Subclass Model() and implement a custom train_step method.
Apply gradients directly with model$optimizer$apply(gradients, weights).
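A minimal sketch of the callback route, assuming the keras3 Callback() constructor (class and object names are illustrative):
print_epoch <- Callback(
  "PrintEpoch",
  on_epoch_end = function(epoch, logs = NULL) {
    cat("finished epoch", epoch + 1, "\n")  # epoch is 0-based
  }
)
model |> fit(x_train, y_train, epochs = 3,
             callbacks = list(print_epoch()))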
fit(object, x = NULL, y = NULL, batch_size = NULL, epochs = 10, verbose = 1, callbacks = NULL, ...)
: Train a Keras model for a fixed number of epochs (iterations).
fit_generator()
: Fits the model on data yielded batch-by-batch by a generator.
train_on_batch()
; test_on_batch()
: Single gradient update or model evaluation over one batch of samples.
print(model)
: Print a summary of a Keras model
plot(model, show_shapes = FALSE, show_dtype = FALSE, show_layer_names = FALSE, ...)
: Plot a Keras model
evaluate(object, x = NULL, y = NULL, batch_size = NULL)
: Evaluate a Keras model.
evaluate_generator()
: Evaluates the model on a data generator.
evaluate(object, x = NULL, y = NULL, batch_size = NULL)
: Evaluate a Keras model.
predict()
: Generate predictions from a Keras model.
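For example, assuming the test arrays from the MNIST example above:
model |> evaluate(x_test, y_test)
probs <- model |> predict(x_test)   # one row of class probabilities per sample
classes <- max.col(probs) - 1       # most likely class (0-9) for each row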
predict_proba()
; predict_classes()
: Generates probability or class probability predictions for the input samples.
predict_on_batch()
: Returns predictions for a single batch of samples.
predict_generator()
: Generates predictions for the input samples from a data generator.
summary()
: Print a summary of a Keras model.
export_savedmodel()
: Export a saved model.
get_layer()
: Retrieves a layer based on either its name (unique) or index.
pop_layer()
: Remove the last layer in a model.
save_model_hdf5()
; load_model_hdf5()
: Save/Load models using HDF5 files.
serialize_model()
; unserialize_model()
: Serialize a model to an R object.
clone_model()
: Clone a model instance.
freeze_weights()
; unfreeze_weights()
: Freeze and unfreeze model or layer weights.
save_model()
; load_model()
: Save/Load models using the “.keras” file format.
save_model_weights()
; load_model_weights()
: Save/load model weights to/from “.h5” files.
save_model_config()
; load_model_config()
: Save/load model architecture to/from a “.json” file.
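For instance (file names are illustrative; Keras 3 expects weight files to end in “.weights.h5”):
save_model(model, "mnist_model.keras")
model2 <- load_model("mnist_model.keras")
save_model_weights(model, "mnist.weights.h5")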
layer_input()
: Input layer.
layer_dense()
: Add a densely-connected NN layer to an output.
layer_einsum_dense()
: Add a dense layer with arbitrary dimensionality.
layer_activation()
: Apply an activation function to an output.
layer_dropout()
: Applies Dropout to the input.
layer_reshape()
: Reshapes an output to a certain shape.
layer_global_average_pooling_1d()
; layer_global_average_pooling_2d()
; layer_global_average_pooling_3d()
: Global average pooling.
layer_activation(object, activation)
: Apply an activation function to an output.
layer_activation_leaky_relu()
: Leaky version of a rectified linear unit.
layer_activation_parametric_relu()
: Parametric rectified linear unit.
layer_activation_thresholded_relu()
: Thresholded rectified linear unit.
layer_activation_elu()
: Exponential linear unit.
image_dataset_from_directory()
Create a TF Dataset from image files in a directory.
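A typical call (directory path and sizes are illustrative):
train_ds <- image_dataset_from_directory(
  "images/train",
  image_size = c(180, 180),
  batch_size = 32
)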
image_load()
, image_from_array()
, image_to_array()
, image_array_save()
: Work with PIL Image instances
Operations that transform image tensors in deterministic ways.
+op_image_crop()
op_image_extract_patches()
op_image_pad()
op_image_resize()
op_image_affine_transform()
op_image_map_coordinates()
op_image_rgb_to_grayscale()
layer_dropout()
: Applies dropout to the input.
layer_spatial_dropout_1d()
; layer_spatial_dropout_2d()
; layer_spatial_dropout_3d()
: Spatial 1D to 3D version of dropout
image_smart_resize()
: Resize images without aspect ratio distortion.
Builtin image preprocessing layers. Note, any image operation function can also be used as a layer in a Model, or used in layer_lambda().
layer_simple_rnn()
: Fully-connected RNN where the output is to be fed back to input.
layer_gru()
: Gated recurrent unit - Cho et al.
layer_cudnn_gru()
: Fast GRU implementation backed by CuDNN.
layer_lstm()
: Long-Short Term Memory unit - Hochreiter 1997.
layer_cudnn_lstm()
: Fast LSTM implementation backed by CuDNN.
layer_resizing()
layer_rescaling()
layer_center_crop()
Preprocessing layers that randomly augment image inputs during training.
layer_locally_connected_1d()
; layer_locally_connected_2d()
: Similar to convolution, but weights are not shared, i.e. different filters for each patch.
layer_random_crop()
layer_random_flip()
layer_random_translation()
layer_random_rotation()
layer_random_zoom()
layer_random_contrast()
layer_random_brightness()
pad_sequences()
: Pads each sequence to the same length (length of the longest sequence).
skipgrams()
: Generates skipgram word pairs.
make_sampling_table()
: Generates word rank-based probabilistic sampling table.
timeseries_dataset_from_array()
: Generate a TF Dataset of sliding windows over a timeseries provided as array.
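For example, sliding windows of length 10 over a numeric series (random data, for illustration only):
x <- matrix(rnorm(500))
ds <- timeseries_dataset_from_array(
  data = x, targets = NULL,
  sequence_length = 10, batch_size = 32
)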
audio_dataset_from_directory()
: Generate a TF Dataset from audio files.
pad_sequences()
: Pad sequences to the same length
text_tokenizer()
: Text tokenization utility.
fit_text_tokenizer()
: Update tokenizer internal vocabulary.
save_text_tokenizer()
; load_text_tokenizer()
: Save a text tokenizer to an external file.
texts_to_sequences()
; texts_to_sequences_generator()
: Transforms each text in texts to sequence of integers.
texts_to_matrix()
; sequences_to_matrix()
: Convert a list of sequences into a matrix.
text_one_hot()
: One-hot encode text to word indices.
text_hashing_trick()
: Converts a text to a sequence of indexes in a fixed-size hashing space.
text_to_word_sequence()
: Convert text to a sequence of words (or tokens).
text_dataset_from_directory()
: Generate a TF Dataset from text files in a directory.
layer_text_vectorization()
, get_vocabulary()
, set_vocabulary()
: Map text to integer sequences.
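A small sketch of the adapt-then-use workflow (the corpus and sizes are made up):
vectorizer <- layer_text_vectorization(max_tokens = 1000,
                                       output_sequence_length = 8)
vectorizer |> adapt(c("hello world", "hello keras"))
get_vocabulary(vectorizer)
vectorizer(c("hello again"))   # returns padded integer sequences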
layer_normalization()
: Normalizes continuous features.
layer_discretization()
: Buckets continuous features by ranges.
layer_category_encoding()
: Encode integer features.
layer_hashing()
: Hash and bin categorical features.
layer_hashed_crossing()
: Cross features using the “hashing trick”.
layer_string_lookup()
: Map strings to (possibly encoded) indices.
layer_integer_lookup()
: Map integers to (possibly encoded) indices.
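For instance (vocabulary chosen for illustration):
lookup <- layer_string_lookup(vocabulary = c("small", "medium", "large"))
lookup(c("large", "small", "huge"))   # out-of-vocabulary strings map to the OOV index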
One-stop utility for preprocessing and encoding structured data. Define a feature space from a list of table columns (features).
+feature_space <- layer_feature_space(features = list(<features>))
Adapt the feature space to a dataset
+adapt(feature_space, dataset)
Use the adapted feature_space
preprocessing layer as a layer in a Keras Model, or in the data input pipeline with tfdatasets::dataset_map()
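A minimal sketch, assuming a tfdatasets dataset named train_ds whose columns match the feature names, using feature helpers from the list below:
feature_space <- layer_feature_space(
  features = list(
    age   = feature_float_normalized(),
    size  = feature_string_categorical(),
    price = feature_float()
  )
)
adapt(feature_space, train_ds)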
Available features:
image_load()
: Loads an image into PIL format.
flow_images_from_data()
; flow_images_from_directory()
: Generates batches of augmented/normalized data from images and labels, or a directory.
image_data_generator()
: Generate minibatches of image data with real-time data augmentation.
fit_image_data_generator()
: Fit image data generator internal statistics to some sample data.
generator_next()
: Retrieve the next item.
image_to_array()
; image_array_resize()
; image_array_save()
: 3D array representation.
feature_float()
feature_float_rescaled()
feature_float_normalized()
feature_float_discretized()
feature_integer_categorical()
feature_integer_hashed()
feature_string_categorical()
feature_string_hashed()
feature_cross()
feature_custom()
Keras applications are deep learning models that are made available alongside pre-trained weights. These models can be used for prediction, feature extraction, and fine-tuning.
+MobileNetV3 Model, pre-trained on ImageNet
+application_mobilenet_v3_large()
application_mobilenet_v3_small()
EfficientNetV2 Model, pre-trained on ImageNet
+application_efficientnet_v2s()
application_efficientnet_v2m()
application_efficientnet_v2l()
Inception-ResNet v2 and v3 model, with weights trained on ImageNet
+application_inception_resnet_v2()
application_inception_v3()
VGG16 and VGG19 models
+application_vgg16()
application_vgg19()
ResNet50 model
application_xception()
; xception_preprocess_input()
: Xception v1 model.
application_inception_v3()
; inception_v3_preprocess_input()
: Inception v3 model, with weights pre-trained on ImageNet.
application_inception_resnet_v2()
; inception_resnet_v2_preprocess_input()
: Inception-ResNet v2 model, with weights trained on ImageNet.
application_vgg16()
; application_vgg19()
: VGG16 and VGG19 models.
application_resnet50()
: ResNet50 model.
application_mobilenet()
; mobilenet_preprocess_input()
; mobilenet_decode_predictions()
; mobilenet_load_model_hdf5()
: MobileNet model architecture.
application_resnet50()
: ResNet50 model.
ImageNet is a large database of images with labels, extensively used for deep learning.
+NASNet model architecture
imagenet_preprocess_input()
; imagenet_decode_predictions()
: Preprocesses a tensor encoding a batch of images for ImageNet, and decodes predictions.
application_nasnet_large()
application_nasnet_mobile()
ImageNet is a large database of images with labels, extensively used for deep learning
+Preprocesses a tensor encoding a batch of images for an application, and decodes predictions from an application:
application_preprocess_inputs()
application_decode_predictions()
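Putting it together (the model choice, image file, and target size are illustrative):
model <- application_mobilenet_v3_large(weights = "imagenet")
img <- image_load("elephant.jpg", target_size = c(224, 224)) |>
  image_to_array()
x <- array_reshape(img, c(1, dim(img)))        # add a batch dimension
x <- application_preprocess_inputs(model, x)
preds <- model |> predict(x)
application_decode_predictions(model, preds, top = 3)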
CC BY SA Posit Software, PBC • info@posit.co • posit.co
-Learn more at tensorflow.rstudio.com.
-Updated: 2024-05.
+Learn more at keras.posit.co.
+Updated: 2024-06.
packageVersion("keras")
packageVersion("keras3")
[1] '2.15.0'
+[1] '1.0.0'
Supported in Databricks Connect v2
Translates into Spark SQL statements
-copy_to(sc, mtcars) %>%
-  mutate(trm = ifelse(am == 0, "auto", "man")) %>%
-  group_by(trm) %>%
+copy_to(sc, mtcars) |>
+  mutate(trm = ifelse(am == 0, "auto", "man")) |>
+  group_by(trm) |>
   summarise_all(mean)
Supported in Databricks Connect v2
-copy_to(sc, mtcars) %>%
-  group_by(cyl) %>%
-  summarise(mpg_m = mean(mpg)) %>% # Summarize in Spark
-  collect() %>%                    # Collect results in R
+copy_to(sc, mtcars) |>
+  group_by(cyl) |>
+  summarise(mpg_m = mean(mpg)) |>  # Summarize in Spark
+  collect() |>                     # Collect results in R
   ggplot() +
   geom_col(aes(cyl, mpg_m))        # Create plot
-copy_to(sc, mtcars) %>%
+copy_to(sc, mtcars) |>
   spark_apply(
     # R only function
     nrow, group_by = "am",
@@ -617,7 +617,7 @@ Distributed R
CC BY SA Posit Software, PBC • info@posit.co • posit.co
Learn more at spark.posit.co and therinspark.com.
-Updated: 2024-05.
+Updated: 2024-06.
packageVersion("sparklyr")
diff --git a/html/sparklyr_files/sparklyr/execute-results/html.json b/html/sparklyr_files/sparklyr/execute-results/html.json
deleted file mode 100644
index 4b8e395b..00000000
--- a/html/sparklyr_files/sparklyr/execute-results/html.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
- "hash": "319f3fddc68fd7b995e659573b518077",
- "result": {
- "engine": "knitr",
- "markdown": "---\ntitle: \"Data Science in Spark with Sparklyr :: Cheat Sheet\"\nformat: \n html:\n toc: true\n highlight-style: a11y-dark\neditor: visual\n---\n\n\n\n\n\n## Intro\n\nsparklyr is an R interface for **Apache Spark**, it provides a complete **dplyr** backend and the option to query directly using **Spark SQL** statement. With **sparklyr**, you can orchestrate distributed machine learning using either **Spark's MLlib** or **H2O** Sparkling Water. Starting with **version 1.044**, **RStudio Desktop**, **Server and Pro include integrated support for the sparklyr package**. You can create and manage connections to Spark clusters and local Spark instances from inside the IDE.\n\n### RStudio Integrates with sparklyr\n\nTODO Screenshots\n\nExpand to read about the sparklyr features in the RStudio IDE.\n\n#### Sparklyr features in the RStudio IDE\n\n- Open connection log\n- Disconnect\n- Open the Spark UI\n- Spark & Hive Tables\n- Preview 1K rows\n\n## Cluster Deployment\n\nIn a managed cluster, the driver node (RStudio, Spark, Hive) connects to the cluster manager (Yarn, Mesos) which connects to the worker nodes (Spark).\n\nIn a stand alone cluster the driver node (RStudio, Spark) connects directly to the worker nodes (Spark).\n\n## Data Science Toolchain with Spark + sparklyr\n\n1. Import\n - Export an R DataFrame\n\n - Read a file\n\n - Read existing Hive table\n2. Tidy/Wrangle\n - dplyr verb\n\n - Direct Spark SQL (DBI)\n\n - SDF function (Scala API)\n3. Understand\n - Transform - Transformer function\n\n - Visualize - Collect data into R for plotting\n\n - Model - Spark MLlib and H2O Extension\n4. Communicate\n - Collect data into R\n\n - Share plots, documents, and apps\n\n## Getting Started\n\n### Local Mode (no cluster required)\n\n1. Install a local version of Spark:\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n spark_install(\"2.0.1\")\n ```\n :::\n\n\n\n\n\n2. Open a connection:\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n sc <- spark_connect(master = \"local\")\n ```\n :::\n\n\n\n\n\n### On a Mesos Managed Cluster\n\n1. Install RStudio Server or Pro on one of the existing nodes\n\n2. Locate path to the cluster's Spark directory\n\n3. Open a connection\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n spark_connect(master = \"[mesos URL]\",\n version = \"1.6.2\", \n spark_home = [Cluster’s Spark path])\n ```\n :::\n\n\n\n\n\n### Using Livy (Experimental)\n\n1. The Livy REST application should be running on the cluster\n\n2. Connect to the cluster\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n sc <- spark_connect(method = \"livy\", \n master = \"http://host:port\")\n ```\n :::\n\n\n\n\n\n### On a Yarn Managed Cluster\n\n1. Install RStudio Server or RStudio Pro on one of the existing nodes, preferably an edge node\n\n2. Locate path to the cluster's Spark Home Directory, it normally is `/usr/lib/spark`\n\n3. Open a connection\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n spark_connect(master=\"yarn-client\", \n version = \"1.6.2\", \n spark_home = [Cluster’s Spark path])\n ```\n :::\n\n\n\n\n\n### On a Spark Standaline Cluster\n\n1. Install RStudio Server or RStudio Pro on one of the existing nodes or a server in the same LAN\n\n2. Install a local version of Spark:\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n spark_install(version = \"2.0.1\")\n ```\n :::\n\n\n\n\n\n3. 
Open a connection\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n spark_connect(master=\"spark:// host:port\",\n version = \"2.0.1\", \n spark_home = spark_home_dir())\n ```\n :::\n\n\n\n\n\n## Tuning Spark\n\n### Example Configuration\n\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nconfig <- spark_config() \nconfig$spark.executor.cores <- 2\nconfig$spark.executor.memory <- \"4G\" \nsc <- spark_connect (master=\"yarn-client\", config = config, version = \"2.0.1\")\n```\n:::\n\n\n\n\n\n### Important Tuning Parameters (with defaults)\n\n- `spark.yarn.am.cores`\n- `spark.yarn.am.memory`: 512m\n- `spark.network.timeout`: 120s\n- `spark.executor.memory`: 1g\n- `spark.executor.cores`: 1\n- `spark.executor.instances`\n- `spark.executor.extraJavaOptions`\n- `spark.executor.heartbeatInterval`: 10s\n- `sparklyr.shell.executor-memory`\n- `sparklyr.shell.driver-memory`\n\n## Using sparklyr\n\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nlibrary(dplyr)\nlibrary(ggplot2)\nlibrary(tidyr)\nset.seed(100)\n\n#Install Spark locally\nspark_install(\"2.0.1\")\n\n# Connect to local version\nsc <- spark_connect(master = \"local\")\n\n# Copy data to Spark memory\nimport_iris <- copy_to(sc, \n iris, \n \"spark_iris\", \n overwrite = TRUE)\n\n# Partition data\npartition_iris <- sdf_partition(import_iris,\n training = 0.5, \n testing = 0.5)\n\n#Create a hive metadata for each partition\n\nsdf_register(partition_iris,\n c(\"spark_iris_training\", \"spark_iris_test\"))\n \nspark_connect(master = \"[mesos URL]\", \n version = \"1.6.2\", spark_home = [Cluster’s Spark path])\n\ntidy_iris <- tbl(sc, \"spark_iris_training\") %>% \n select(Species, Petal_Length, Petal_Width)\n\n# Spark ML Decision Tree Model\nmodel_iris <- tidy_iris %>%\n ml_decision_tree(response = \"Species\",\n features = c(\"Petal_Length\", \"Petal_Width\"))\n\n# Create reference to Spark table\ntest_iris <- tbl(sc, \"spark_iris_test\")\n\n# Bring data back into R memory for plotting\npred_iris <- sdf_predict(model_iris, test_iris) %>% \n collect\n\npred_iris %>% inner_join(data.frame(prediction = 0:2, lab = model_iris$model.parameters$labels)) %>%\n ggplot(aes(Petal_Length, Petal_Width, col = lab)) + geom_point()\n\n# Disconnect\nspark_disconnect(sc)\n```\n:::\n\n\n\n\n\n\n\n## Reactivity\n\n### Copy a Data Frame Into Spark\n\n- `sdf_copy_to(sc, x, name, memory, repartition, overwrite)`\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n sdf_copy_to(sc, iris, \"spark_iris\")\n ```\n :::\n\n\n\n\n\n### Import Into Spark From a File\n\nArguments that apply to all functions: `sc`, `name`, `path`, `options = list()`, `repartition = 0`, `memory = TRUE`, `overwrite = TRUE`\n\n- `spark_read_csv(header = TRUE, columns = NULL, infer_schema = TRUE, delimiter = \",\", quote = \"\\\"\", escape = \"\\\\\", charset = \"UTF-8\", null_value = NULL)`\n\n- `spark_read_json()`\n\n- `spark_read_parquet()`\n\n### Spark SQL Commands\n\n- `DBI::dbWriteTable(conn, value)`\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n DBI::dbWriteTable(sc, \"spark_iris\", iris)\n ```\n :::\n\n\n\n\n\n### From a Table in Hive\n\n- `tbl_cache(sc, name, force = TRUE)`: Loads the table into memory\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n my_var <- tbl_cache(sc, name= \"hive_iris\")\n ```\n :::\n\n\n\n\n\n- `dplyr::tbl(scr, ...)`: Creates a reference to the table without loading it into memory\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n my_var <- dplyr::tbl(sc, name= \"hive_iris\")\n ```\n :::\n\n\n\n\n\n## Wrangle\n\n### Spark SQL via dplyer Verbs\n\n- Translates 
into Spark SQL statements:\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n my_table <- my_var %>% \n filter(Species==\"setosa\") %>% \n sample_n(10)\n ```\n :::\n\n\n\n\n\n### Direct Spark SQL Commands\n\n- `DBI::dbGetQuery(conn, statement)`\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n my_table <- DBI::dbGetQuery(sc, \"SELECT * FROM iris LIMIT 10\")\n ```\n :::\n\n\n\n\n\n### Scala API via SDF Functions\n\n- `sdf_mutate(.data)`: Works like dplyr mutate function\n\n- `sdf_partition(x, ..., weights = NULL, seed = sample (.Machine$integer.max, 1))`\n\n\n\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n sdf_partition(x, training = 0.5, test = 0.5) sdf_register(x, name = NULL)\n ```\n :::\n\n\n\n\n\n- `sdf_register(x, name = NULL)`: Gives a Spark DataFrame a table name\n\n- `sdf_sample(x, fraction = 1, replacement = TRUE, seed = NULL)`\n\n- `sdf_sort(x, columns)`: Sorts by \\>=1 columns in ascending order\n\n- `sdf_with_unique_id(x, id = \"id\")`\n\n- `sdf_predict(object, newdata)`: Spark DataFrame with predicted values\n\n### ML Transformers\n\nExample:\n\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nft_binarizer(my_table,\n input.col=\"Petal_Length\", \n output.col=\"petal_large\", \n threshold=1.2)\n```\n:::\n\n\n\n\n\nArguments that apply to all functions: `x`, `input.col = NULL`, `output.col = NULL`\n\n- `ft_binarizer(threshold = 0.5)`: Assigned values based on threshold\n\n- `ft_bucketizer(splits)`: Numeric column to discretized column\n\n- `ft_discrete_cosine_transform(inverse = FALSE)`: Time domain to frequency domain\n\n- `ft_elementwise_product(scaling.col)`: Element-wise product between 2 cols\n\n- `ft_index_to_string()`: Index labels back to label as strings\n\n- `ft_one_hot_encoder()`: Continuous to binary vectors\n\n- `ft_quantile_discretizer(n.buckets=5L)`: Continuous to binned categorical values\n\n- `ft_sql_transformer(sql)`\n\n- `ft_string_indexer(params = NULL)`: Column of labels into a column of label indices\n\n- `ft_vector_assembler()`: Combine vectors into single row-vector\n\n## Visulize & Communicate\n\n### Download Data to R Memory\n\nExample:\n\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nr_table <- collect(my_table) \nplot(Petal_Width~Petal_Length, \n data=r_table)\n```\n:::\n\n\n\n\n\n- `dplyr::collect(x)`: Download a Spark DataFrame to an R DataFrame\n\n- `sdf_read_column(x, column)`: Returns contents of a single column to R\n\n### Save From Spark to File System\n\nArguments that apply to all functions: `x`, `path`\n\n- `spark_read_csv( header = TRUE, delimiter = \",\", quote = \"\\\"\", escape = \"\\\\\", charset = \"UTF-8\", null_value = NULL)`\n\n- `spark_read_json(mode = NULL)`\n\n- `spark_read_parquet(mode = NULL)`\n\n## Reading & Writing from Apache Spark\n\nWrite to Spark, from R with `sdf_copy_to()`, `dplyr::copy_to()`, or `DBI::sbWriteTable()`.\n\nRead from Spark, to R with `sdf_collect()`, `dplyr::collect()`, `sdf_read_column`.\n\n------------------------------------------------------------------------\n\nWrite to Spark, from Hive with `tbl_cache()` or `dplyr::tbl()`.\n\n------------------------------------------------------------------------\n\nWrite to Spark from the file system with `spark_read_()`.\n\nRead from Spark to the file system with `spark_write_()`.\n\n## Extensions\n\nCreate an R package that calls the full Spark API & provide interfaces to Park packages.\n\n### Core Types\n\n- `spark_connection()`: Connection between R and the Spark shell process\n\n- `spark_jobj()`: Instance og a remote Spark object\n\n- `spark_dataframe()`: Instance of a 
remote Spark DataFrame object\n\n### Call Spark From R\n\n- `invoke()`: Call a method on a Java object\n\n- `invoke_new()`: Create a new object by invoking a constructor\n\n- `invoke_static()`: Call a static method on an object\n\n### Machine Learning Extensions\n\n- `ml_create_dummy_variables()`\n\n- `ml_prepare_dataframe()`\n\n- `ml_prepare_response_features_intercept()`\n\n- `ml_options()`\n\n- `ml_model()`\n\n## Model (MLlib)\n\nExample:\n\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nml_decision_tree(my_table, \n response = \"Species\", features = c(\"Petal_Length\" , \"Petal_Width\"))\n```\n:::\n\n\n\n\n\n- `ml_als_factorization(x, user.column = \"user\", rating.column = \"rating\", item.column = \"item\", rank = 10L, regularization.parameter = 0.1, iter.max = 10L, ml.options = ml_options())`\n\n- `ml_decision_tree(x, response, features, max.bins = 32L, max.depth = 5L, type = c(\"auto\", \"regression\", \"classification\"), ml.options = ml_options())`: Same options for: ml_gradient_boosted_trees\n\n- `ml_generalized_linear_regression(x, response, features, intercept = TRUE, family = gaussian(link = \"identity\"), iter.max = 100L, ml.options = ml_options())`\n\n- `ml_kmeans(x, centers, iter.max = 100, features = dplyr::tbl_vars(x), compute.cost = TRUE, tolerance = 1e-04, ml.options = ml_options())`\n\n- `ml_lda(x, features = dplyr::tbl_vars(x), k = length(features), alpha = (50/k) + 1, beta = 0.1 + 1, ml.options = ml_options())`\n\n- `ml_linear_regression(x, response, features, intercept = TRUE, alpha = 0, lambda = 0, iter.max = 100L, ml.options = ml_options())`: Same options for: ml_logistic_regression\n\n- `ml_multilayer_perceptron(x, response, features, layers, iter.max = 100, seed = sample(.Machine$integer.max, 1), ml.options = ml_options())`\n\n- `ml_naive_bayes(x, response, features, lambda = 0, ml.options = ml_options())`\n\n- `ml_one_vs_rest(x, classifier, response, features, ml.options = ml_options())`\n\n- `ml_pca(x, features = dplyr::tbl_vars(x), ml.options = ml_options())`\n\n- `ml_random_forest(x, response, features, max.bins = 32L, max.depth = 5L, num.trees = 20L, type = c(\"auto\", \"regression\", \"classification\"), ml.options = ml_options())`\n\n- `ml_survival_regression(x, response, features, intercept = TRUE,censor = \"censor\", iter.max = 100L, ml.options = ml_options())`\n\n- `ml_binary_classification_eval(predicted_tbl_spark, label, score, metric = \"areaUnderROC\")`\n\n- `ml_classification_eval(predicted_tbl_spark, label, predicted_lbl, metric = \"f1\")`\n\n- `ml_tree_feature_importance(sc, model)`\n",
- "supporting": [],
- "filters": [
- "rmarkdown/pagebreak.lua"
- ],
- "includes": {},
- "engineDependencies": {},
- "preserve": {},
- "postProcess": true
- }
-}
\ No newline at end of file
diff --git a/index.html b/index.html
index 6864a379..7bbe3e91 100644
--- a/index.html
+++ b/index.html
@@ -214,7 +214,7 @@ Posit Cheatsheets