kerasformula_bigKRLS_Learn.Rmd

---
title: "Learn kerasformula and Keras"
output: html_notebook
---

 
**kerasformula** sample code
Compare the default dense model that `kms` generates to the LSTM model.

```{r}

# Environment setup for TensorFlow, reticulate, Keras and kerasformula.
# NOTE(review): the install_* calls below are not guarded, so they run again
# every time this chunk is executed; `force = TRUE` requests a reinstall even
# when the package is already present. Consider running them once and then
# commenting them out.

#devtools::install_github("rstudio/tensorflow", force = TRUE)
library(tensorflow)   # attach tensorflow before calling install_tensorflow() below
# Install the Python dependencies using the package's built-in installer.
install_tensorflow()

# Alternative ways of selecting a Python environment (disabled):
#use_virtualenv("~/myenv")
#use_condaenv("myenv")


## Point reticulate at a specific local Python installation.
## NOTE(review): the paths below are machine-specific (a D: drive Anaconda
## directory whose name suggests Python 2.7.15) -- adjust for your machine.
devtools::install_github("rstudio/reticulate", force = TRUE)
library(reticulate)
reticulate::use_python("D:/Anaconda3/pkgs/python-2.7.15-he216670_0")
# Show the Python configuration reticulate is using.
reticulate::py_config()
#Sys.getenv("RETICULATE_PYTHON")
# NOTE(review): these environment variables are set after py_config() has
# already been called; presumably they only influence processes or sessions
# started later -- confirm this ordering is intended.
Sys.setenv(RETICULATE_PYTHON = "D:/Anaconda3/pkgs/python-2.7.15-he216670_0")
Sys.setenv(PYTHONHOME = "D:/Anaconda3/pkgs/python-2.7.15-he216670_0")
##


# Keras itself, installed through the conda executable at the given path.
devtools::install_github("rstudio/keras", force = TRUE)
library(keras)
install_keras(conda = "D:/Anaconda3/Scripts/conda.exe")
# Ways of installing kerasformula (disabled):
#install.packages("kerasformula", force = TRUE)
#devtools::install_github("rdrr1990/kerasformula")
library(kerasformula)
```

Load the IMDB movie-review data via `library(keras)`.
Note: To control runtime, the number of features is limited and only a sliver of the training data is used.

```{r}
# Vocabulary size: keep only the 5,000 most popular words found in the movie
# reviews.
max_features <- 5000
# Number of words kept per review after padding/truncation (among the top
# max_features most common words).
maxlen <- 50

cat('Loading data...\n')

imdb <- dataset_imdb(num_words = max_features)

# Build one data frame: column V1 is the label, the remaining columns are the
# padded word-index sequence of each training review.
# `maxlen` is passed explicitly so that every review is cut/padded to 50
# tokens; previously `maxlen` was defined but never used, so pad_sequences()
# fell back to its default length instead of the intended 50.
imdb_df <- as.data.frame(
  cbind(imdb$train$y, pad_sequences(imdb$train$x, maxlen = maxlen))
)

# Row indices of a random 1,000-review subset used to keep the demos fast.
demo_sample <- sample(nrow(imdb_df), 1000)

```

kms's default dense model
```{r}
# Fit kms() with its default settings on the sampled reviews for 2 epochs,
# then show the `confusion` element of the result.
out_dense <- kms(
  input_formula = "V1 ~ .",
  data = imdb_df[demo_sample, ],
  Nepochs = 2
)
out_dense$confusion
```

Alternative versions of kms's default dense model
```{r}

# Variation on the dense model: 10 epochs, a fixed seed, `scale = NULL`, and
# an explicit four-layer specification passed through `layers`.
out_dense <- kms(
  input_formula = "V1 ~ .",
  data = imdb_df[demo_sample, ],
  Nepochs = 10,
  seed = 123,
  scale = NULL,
  layers = list(
    # One entry per layer. The trailing NA in `units` and `dropout` is
    # presumably filled in by kms() for the output layer -- see ?kms.
    units = c(512, 256, 128, NA),
    activation = c("softmax", "relu", "relu", "softmax"),
    dropout = c(0.75, 0.4, 0.3, NA),
    use_bias = TRUE,
    kernel_initializer = NULL,
    kernel_regularizer = "regularizer_l1",
    bias_regularizer = "regularizer_l1",
    activity_regularizer = "regularizer_l1"
  )
)
out_dense$confusion

```


LSTM model (a custom Keras model passed to `kms`)
```{r}
# Hand-built sequential model: embedding -> LSTM -> single sigmoid unit.
k <- keras_model_sequential()
k %>%
  layer_embedding(
    input_dim = max_features,
    output_dim = 128
  ) %>%
  layer_lstm(
    units = 64,
    dropout = 0.2,
    recurrent_dropout = 0.2
  ) %>%
  layer_dense(
    units = 1,
    activation = "sigmoid"
  )

# Binary classification set-up: cross-entropy loss, Adam optimizer, accuracy.
k %>% compile(
  optimizer = "adam",
  loss = "binary_crossentropy",
  metrics = "accuracy"
)

# Hand the compiled model to kms() instead of letting it build the default one.
out_lstm <- kms(
  input_formula = "V1 ~ .",
  data = imdb_df[demo_sample, ],
  keras_model_seq = k,
  Nepochs = 2
)
out_lstm$confusion


```


**bigKRLS** sample code

```{r}
# One-time installation (uncomment to run):
# install.packages(c("Rcpp", "RcppArmadillo", "bigmemory", "biganalytics",
#                    "snow", "shiny", "httpuv", "scales", "lazyeval",
#                    "tibble"))
# devtools::install_github('rdrr1990/bigKRLS')

# Attach the package and open its introductory vignette.
library("bigKRLS")
vignette(topic = "bigKRLS_basics")
```

bigKRLS() and plotting
```{r}

# Peek at the first five rows of the toy data.
head(mtcars, 5)

# Fit bigKRLS on a single core: mpg is the outcome, every other mtcars column
# is a predictor.
reg.out <- bigKRLS(
  y = as.matrix(mtcars$mpg),
  X = as.matrix(mtcars[, -1]),
  Ncores = 1
)

summary(reg.out)

# Entries of the fitted object's K matrix: rows are the 4-cylinder cars, the
# column is the car whose name contains "Corolla".
s <- reg.out$K[which(mtcars$cyl == 4), grep("Corolla", rownames(mtcars))]

# Bar chart of those entries. Bars are labelled with the second word of each
# car name and coloured along a red-to-blue ramp by the rank of their value.
barplot(
  height = s,
  names.arg = lapply(
    strsplit(rownames(mtcars), split = " "),
    `[`, 2
  )[which(mtcars$cyl == 4)],
  col = colorRampPalette(c("red", "blue"))(length(s))[rank(s)],
  main = "Similarity to a Toyota Corolla",
  sub = "Toy Data from mtcars",
  ylab = "Kernel",
  cex.names = .7
)

# Horsepower against column 3 of the fitted derivatives (hp is the third
# predictor in mtcars[, -1]), with a smooth curve. Points are coloured along a
# blue-to-red ramp by the rank of the squared coefficients.
scatter.smooth(
  x = mtcars$hp,
  y = reg.out$derivatives[, 3],
  xlab = "Horsepower",
  ylab = "HP's Effect",
  main = "Horsepower's Marginal Effect on Fuel Efficiency",
  sub = "Toy Data from mtcars",
  pch = 19,
  bty = "n",
  col = colorRampPalette(c("blue", "red"))(nrow(mtcars))[rank(reg.out$coeffs^2)],
  xlim = c(50, 400),
  ylim = c(-0.042, 0.015)
)

```


Cross-Validation, In-sample and out-of-sample statistics, and Out-of-Sample prediction
```{r}

# Cross-validation ----
# 4-fold cross-validation of the mpg model on a single core with a fixed seed.
CV.out <- crossvalidate.bigKRLS(
  y = as.matrix(mtcars$mpg),
  X = as.matrix(mtcars[, -1]),
  seed = 123,
  Kfolds = 4,
  Ncores = 1
)
# Summary of the model trained for the first fold.
summary(CV.out$fold_1$trained)


# In-sample and out-of-sample statistics ----
CV.out$MSE_is
CV.out$MSE_oos
CV.out$R2_oos
CV.out$R2AME_oos


# Predicting with out-of-sample data ----
# Same predictors as the fitted model, but with every car's hp set to 200.
Xnew <- mtcars[, -1]
Xnew[["hp"]] <- 200
forecast <- predict(reg.out, as.matrix(Xnew))

# Share of cars whose prediction under hp = 200 is below their observed mpg.
mean(forecast$predicted < mtcars$mpg)

```


bigKRLS: “Big” File Management

```{r}
# If N > 2,500 or if you supply big matrices, using save() and load() will
# crash your R session. Instead you may do one of two things to save:
#   1. pass model_subfolder_name to bigKRLS(), or
#   2. call save.bigKRLS() on the fitted object.
#
# Note: X and y should only contain numeric data (no missing data, factors, or
# vectors of constants) and may be base R matrices or "big" matrices (from
# bigmemory).

# Define the inputs here so the chunk runs in a fresh session; previously these
# two definitions were commented out and bigKRLS(y, X, ...) failed because `y`
# and `X` did not exist.
y <- as.matrix(mtcars$mpg)
X <- as.matrix(mtcars[, -1])

# Option 1: save while fitting.
out <- bigKRLS(y, X, model_subfolder_name = "my_results")
# Option 2: save an already-fitted object.
# NOTE(review): "my_results" has already been used by the call above at this
# point -- confirm how save.bigKRLS() handles an existing subfolder, or run
# only one of the two options.
save.bigKRLS(out, "my_results")

# Either will save the model estimates to a new subfolder called "my_results"
# in your current working directory.
# To re-load,

load.bigKRLS("my_results")

# When N > 2,500 or the user provides big matrices, big matrices will be
# returned, which are really just memory addresses.

Z <- big.matrix(nrow = 5, ncol = 5, init = 1)
Z

## An object of class "big.matrix"
## Slot "address":
## <pointer: 0x639c570>

# You do not necessarily need to work with the big square matrices in the
# output. But if you do and you are comfortable they fit in memory, just use
# the square brackets:

Z[]
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 1 1 1 1
## [2,] 1 1 1 1 1
## [3,] 1 1 1 1 1
## [4,] 1 1 1 1 1
## [5,] 1 1 1 1 1

```