-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathR_Tips.Rmd
536 lines (409 loc) · 18.3 KB
/
R_Tips.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
---
title: "R Tips & Tricks"
author: "Julia Ponomarenko, CRG Bioinformatics core."
date: "Dec 27, 2017"
output:
html_document:
keep_md: true
toc: true
toc_depth: 4
theme: readable
---
<br/>
* Nice R tutorial
http://journals.plos.org/ploscompbiol/article/file?id=10.1371/journal.pcbi.1000482.s001&type=supplementary
* R swirl tutorials https://github.com/swirldev/swirl_courses. Basic one was used here.
<br/>
The example dataset used in this tutorial (Davis from the package car)
------------------
```{r, warning=FALSE, message=FALSE}
#install.packages("car")
library(car)
data <- Davis
```
<br/>
Files and Directories
--------------------------
```{r}
getwd() # current directory - This is directory where this file is located
# to set up this directory as a working in console
dir <- getwd()
setwd(dir) # run it in console, not from inside the notebook
file.info("R_Tips.Rmd")
file.info("R_Tips.Rmd")$ctime
list.files()
dir.create("testdir", showWarnings = F)
setwd("testdir") # run in console
file.create("mytest.R")
file.exists("mytest.R")
file.rename("mytest.R","mytest2.R")
file.copy("mytest2.R","mytest3.R")
list.files()
# get full paths of all files and directories containing "R" or "r" in the directory dir
filenames <- list.files(dir, pattern = "*R*", full.names = TRUE, recursive = TRUE, ignore.case = TRUE, include.dirs = TRUE)
Dir_separator <- "/" # for Windows it is "\"
file_name <- as.character(lapply(strsplit(filenames[1], Dir_separator), tail, n = 1)) # to get just the file name, for example
# while full.names = FALSE in list.files()does the same without the need to parse filenames
list.dirs() # also useful command when looping via directories is needed
dir.create(file.path('testdir2', 'testdir3'), recursive = TRUE)
setwd("..") # one directory up
unlink("testdir", recursive = T) #removes testdir and all folders inside
```
<br/>
Read files
-----------
```{r}
file <- "R_Tips.Rmd"
x <- read.table(file, skip = 7, nrows = 1) # for example, we read only line 8
x <- as.numeric(x[2]) # for example, we need a value of the 2nd element of the vector x
# let's write data to the file with the description lines on top
file <- "Davis"
conn <- file(file, open = "w")
writeLines("This is the dataset Davis from the package car", conn)
write("This is my favorite dataset", conn, append = TRUE)
close(conn)
conn <- file(file, open = "a")
write.table(data, conn, row.names = FALSE) # can be write.csv()
close(conn)
#to read a data frame
conn <- file(file, open = "r")
df <- read.table(conn, skip = 2, header = TRUE) # can be read.csv()
close(conn)
```
<br/>
Data structures: Sequences, Numbers, Vectors, Matrices, Lists
-----------
```{r}
w <<- 10 # assign w to a global environment ???
6.03 / 2
6.03 %/% 2 # integer division
6.03 %% 2 # getting remainder
my_seq <- seq(5,10,length=30)
my_seq
# all three below are identical
1:length(my_seq)
seq(along.with = my_seq)
seq_along(my_seq)
# notice difference!
rep(c(1,2,3), times = 10)
rep(c(1,2,3), each = 10)
x <- c(1,2,3,4,5,6)
y <- x > 3 # boolean vector
chars <- c("my", "name", "is")
x <- paste(chars, collapse = " ") # concatenate vector
paste("my", "name", "is", sep = " ") # same as above, but not paste(chars, sep = " ")
# Careful: paste with vectors recycles the shorter vector
paste(LETTERS, 1:4, sep = "-") # output is a vector!
# random sampling from vectors without replacement
y <- rnorm(1000)
z <- rep(NA, 1000)
my_data <- sample(c(y, z), 100)
my_na <- is.na(my_data)
sum(my_na) # how many NAs
x <- my_data
y <- x[!is.na(x)]
y[y > 0]
x[!is.na(x) & x > 0] # same as above
x[x > 0] # gives NA for NA because NA > 0 = NA
# NaN stand for "not a number"
# Inf stand for infinity
my_vector <- 1:20
dim(my_vector) <- c(4, 5) # dim allows both to get and set the 'dim' attribute to make a vector to a matrix
attributes(my_vector)
class(my_vector)
my_matrix <- my_vector
my_matrix2 <- matrix(data = 1:20, nrow = 4)
identical(my_matrix, my_matrix2)
# naming matrix rows and columns
patients <- c("Bill", "Gina", "Kelly", "Sean") # are in rows
cbind(patients,my_matrix) # numbers in my_matrix are implicitly coerced in characters!
my_data <- data.frame(patients, my_matrix) # so, use a data.frame instead
cnames <- c("patient", "age", "weight", "bp", "rating", "test")
colnames(my_data) <- cnames
# which() function finds the indices:
ints <- sample(10)
which(ints > 7)
any(ints < 0)
all(ints > 0)
```
#### Lists
```{r}
l <- list(id = "Joe", age = 25, dob = c(1980, 3, 29))
l$id
l[[1]]
l["id"] <- "Jim" #change element
l["age"] <- NULL # drop element
```
#### Operators &, &&, | and ||
```{r}
#the `&` operator to evaluate AND across a vector:
TRUE & c(TRUE, FALSE, FALSE)
#The `&&` version of AND only evaluates the first member of a vector:
TRUE && c(TRUE, FALSE, FALSE)
#The OR operator follows a similar set of rules. The `|` version of OR evaluates OR across an entire vector, while the `||` version of OR only evaluates the first member of a vector.
TRUE | c(TRUE, FALSE, FALSE)
#All AND operators are evaluated before OR operators
isTRUE(6 > 4)
xor(5 == 6, !FALSE)
```
<br/>
Functions
-----------
Type the function name to see its source code.
Use args(function_name) to see the function arguments
<br/>
#### Function can be passed as an argument to other functions
```{r}
evaluate <- function(func, dat){
return(func(dat))
}
evaluate(sd, c(1.4, 3.6, 7.9, 8.8))
evaluate(function(x){x+1}, 6)
evaluate(function(x){x[1]}, c(8, 4, 0))
evaluate(function(x){x[length(x)]}, c(8, 4, 0)) # accessing the last element
```
<br/>
#### Ellipses
Use ellipsis when the function calls the variable number of arguments, see https://www.r-bloggers.com/r-three-dots-ellipsis/
The ellipses can be used to pass on arguments to other functions that are
used within the function. Usually a function that has the
ellipses as an argument has the ellipses as the last argument. The usage of
such a function would look like
__ellipses_func(arg1, arg2 = TRUE, ...)__
<br/>
However, the usage for the paste() function is as follows:
__paste(..., sep = " ", collapse = NULL)__,
where the ellipses is the first argument, and all other arguments after the ellipses have default values. __This is a strict rule in R programming: all arguments after an ellipses must have default values. __
```{r}
telegram <- function(...){
args <- list(...)
txt <- paste(args, collapse = " ")
paste("START", txt, "STOP", collapse = " ")
}
telegram("I", "want", "to eat")
telegram2 <- function(...){
args <- list(...)
#extract named arguments from the args list by the name of the argument and double brackets:
who <- args[["who"]]
verb <- args[["verb"]]
what <- args[["what"]]
paste("The person", who, "decided that she", verb, "to do the following", what)
}
telegram2(who = "I", verb = "want", what = "to eat")
```
<br/>
Vectorization - Looping through an entire vector
--------------------------
```{r}
x <- runif(100) # a vector of uniformly distributed 100 values
y <- ifelse(x > 0.5, 1, 0)
```
<br/>
Q. Flip a biased coin [p=0.6 of heads] 100 times; how many heads do you get? Repeat this for 1000 trials.
```{r}
x <- replicate(1000, rbinom(n = 100, size = 1, p = 0.6))
heads <- colSums(x)
hist(heads)
# or the same
hist(rbinom(n = 1000, size = 100, p = 0.6))
```
<br/>
Q. Fibonacci sequence (0, 1, 1, 2, 3, 5, 8, 13, 21,...).
Wite a function that returns the n-th element in the sequence
```{r}
fibo <- function(n){
x1 <- 0
x2 <- 1
if (n == 1) return(x1)
if (n == 2) return(x2)
for (i in 3:n) {
x3 <- x1 + x2
x1 <- x2
x2 <- x3
}
return(x3)
}
system.time(o1 <- fibo(10000))
# more R-style solution using a recursive function
fi <- function(n){
if (n == 1) return(0)
if (n == 2) return(1)
fi(n-1) + fi(n-2)
}
# however, the latter function is terribly slow - compare execution time!!
system.time(o1 <- fibo(30))
system.time(o2 <- fi(30))
```
<br/>
#### lapply() and sapply() are for lists and vectors
These powerful functions, along with their close relatives vapply() and tapply(), among others, offer a concise and convenient means of implementing the Split-Apply-Combine strategy for data analysis.
The lapply() function (l stands for list) takes a list as input, applies a function to each element of the list, then returns a list of the same length as the original one. Since a data frame is just a list of vectors (you can see this with as.list(data)), we can use lapply() to apply the class() function to each column of the dataset.
```{r}
cls_list <- lapply(data, class)
cls_vect <- as.character(cls_list) # chacter vector
cls_vect <- sapply(data, class) # sapply (s stands for simplifying the output) allows to get the above in one command
#calculate mean for each column with numbers
lapply(data[,2:5], mean, na.rm = TRUE) # NA needs to be omitted
sapply(data[,2:5], mean, na.rm = TRUE) # to get a vector
sapply(data[,2:5], mean, na.rm = TRUE, simplify = F) # to get a list as in lapply
#However if lapply returns list of vectors, e.g.
lapply(data[,2:5], range, na.rm = TRUE)
# then sapply returns the matrix
sapply(data[,2:5], range, na.rm = TRUE)
# what if lapply returns vectors of different length?
unique_vals <- lapply(data, unique, na.rm = TRUE)
sapply(unique_vals, length)
# then sapply returns the list, same as lapply
sapply(data, unique, na.rm = TRUE)
# using lapply() with an anonymous function, e.g. to get the first unique elements
lapply(unique_vals, function(x) x[1])
```
<br/>
#### vapply() returns a vector explicitly, while sapply() tries to guess the output format
```{r}
vapply(data, class, character(1)) # this might be a safe option to use in a running script rather than playing with data interactively
```
<br/>
#### tapply() splits data up into groups based on the value of the 2nd listed variable, then applies a function to the members of each group
```{r}
tapply(data$weight, data$sex, summary) # summary of weights by sex
```
<br/>
#### apply() is for matrices and data frames
```{r, results="hide"}
apply(data[,2:5], 2, mean, na.rm = TRUE) # returns a vector of means by columns
apply(data[,2:5], 1, mean, na.rm = TRUE) # returns a vector of means by rows
df <- data[apply(data[,2:5], 1, sum, na.rm = TRUE) > 500,] # select rows with sum above 500
apply(data, 2, function(x) length(unique(na.omit(x)))) # count unique values in each column, omitting NA
apply(data[,2:5], 1, function(x) length(x[x < 80])) # how many values below 80 each row has
apply(data, 2, function(x) class(x))
```
<br/>
Exploring data
-------------------------
```{r}
object.size(data) # size of a dataset
summary(data)
str(data)
table(data$sex)
```
<br/>
Sampling
------------------
```{r}
sample(1:6, 4, replace = TRUE) # simulate rolling four six-sided dice
sample(LETTERS, 5) # w/o replacement
flips <- sample(c(0, 1), 100, replace = TRUE, prob = c(0.3, 0.7)) #flipping unfair coin 100 times
sum(flips)
flips2 <- rbinom(n = 100, size = 1, prob = 0.7) # same as above
sum(flips2)
rnorm(10, mean = 100, sd = 25)
my_pois <- replicate(100, rpois(5, 10)) # outputs a matrix, each column of which contains 5 random numbers generated from a Poisson distribution with lambda (or mean) of 10
cm <- colMeans(my_pois)
hist(cm) # Central limit theoreme at work!
```
<br/>
Debugging in R
-------------------
* traceback()
* Use browser() to single-step through your code. Place it within your function at the point you want to examine (e.g.) local variables.
* Use debug(function.name) to step through entire function. undebug() will remove that debug call.
* recover() is like browser, except you can choose which level to inspect, rather than the level at which browser was called.
* Add warnings and errors to your code using warning(), stop().
* Can add “assertions” into your code to check that certain values hold; e.g.,
stopifnot (x>0)
* debug(fn), undebug(fn)
* To run a script with all commands and output, in Console of RStudio type source("test.R", echo=T)
<br/>
Running R scripts in command line
-----------------
* At the command line, type “R CMD BATCH trig.R”. R will start up,
process your commands and then quit. Output will be in the file trig.Rout. If there were no errors, the last line of the output file shows the time taken to run the script.
* At the command line, "Rscript trig.R"" will print the output to the terminal
* To run the script as "./trig.R" (for example from the bash script), add "#!/usr/bin/env Rscript" at the top of the file and make "chmod 755 trig.R"
* Another way to run a script, "R < trig.R --no-save", or --save
* Any R command can be run in the command line as 'Rscript -e "getwd()"' or 'Rscript -e "install.pacjages(c("pack1", "pack2"))' , or preceed with "sudo" if it is installed not in the default location
* See Rscript --help for options. For example, --verbose can be used.
<br/>
Using GitHub from R studio
----------------------------
http://happygitwithr.com/new-github-first.html provides instruction how to setup R studio to use with git and Github.
#### “GitHub first, then RStudio” sequence for each new project
* In github browser, create a new repo, with README.
* In R Studio, start a new RStudio Project, via File > New Project > Version Control > Git, opening a new session and creating a new folder on your computer.
* Using Git tab on the right panel, do commit of all valuable changes in local git repository.
* Once ready, push local changes to GitHub, but first, to synchronize with local git repo, pull the current version from GitHub, pushing the blue button "Pull", and then the green button "Push".
<br/>
Using R markdown in github
--------
After http://happygitwithr.com/rmd-test-drive.html
The magical process that turns your R Markdown to HTML is like so: foo.Rmd --> foo.md --> foo.html. Note the intermediate markdown, foo.md. By default RStudio discards this, but you might want to hold on to that markdown.
Why? GitHub gives very special treatment to markdown files. They are rendered in an almost HTML-like way. This is great because it preserves all the charms of plain text but gives you a pseudo-webpage for free when you visit the file in the browser. In contrast, HTML is rendered as plain text on GitHub and you’ll have to take special measures to see it the way you want.
Add in output at the top of Rmd file "keep_md: true"
By making foo.Rmd available, others can see and run your actual code. By sharing foo.md and/or foo.html, others can casually browse your end product and decide if they even want to bother.
* Add figures on GitHub md-document, using for example, ```{r, CLT_50_samples, fig.path='Figures/'} and commiting the folder Figures as well (that is the whole folder with the project)
```{r, CLT_50_samples, fig.path='Figures/'}
n_sample <- 50
lambda <- 8
size <- 5
my_pois <- replicate(n_sample, rpois(size, lambda)) # outputs a matrix, each column of which contains 5 random numbers generated from a Poisson distribution with lambda (or mean) of 10
cm <- colMeans(my_pois)
title <- paste("Central limit theorem at work: \nDistribution of means of ", n_sample, " samples of size ", size, "\n from the Poisson distribution with lambda = ", lambda)
hist(cm, col = "gainsboro", ylim = c(0, 0.5), xlim = c(2,14), main = title, freq = F, breaks = 20)
lines(density(cm), lwd = 2)
curve(dnorm(x, mean = mean(cm), sd = sd(cm)), col = "red", lwd = 2, add = T)
```
<br/>
```{r, CLT_100_samples, fig.path='Figures/', echo = FALSE}
n_sample <- 100
lambda <- 8
size <- 5
my_pois <- replicate(n_sample, rpois(size, lambda)) # outputs a matrix, each column of which contains 5 random numbers generated from a Poisson distribution with lambda (or mean) of 10
cm <- colMeans(my_pois)
title <- paste("Means of ", n_sample, " samples")
hist(cm, col = "gainsboro", ylim = c(0, 0.5), xlim = c(2,14), main = title, freq = F, breaks = 20)
lines(density(cm), lwd = 2)
curve(dnorm(x, mean = mean(cm), sd = sd(cm)), col = "red", lwd = 2, add = T)
```
```{r, CLT_1000_samples, fig.path='Figures/', echo = FALSE}
n_sample <- 1000
lambda <- 8
size <- 5
my_pois <- replicate(n_sample, rpois(size, lambda)) # outputs a matrix, each column of which contains 5 random numbers generated from a Poisson distribution with lambda (or mean) of 10
cm <- colMeans(my_pois)
title <- paste("Means of ", n_sample, " samples")
hist(cm, col = "gainsboro", ylim = c(0, 0.5), xlim = c(2,14), main = title, freq = F, breaks = 20)
lines(density(cm), lwd = 2)
curve(dnorm(x, mean = mean(cm), sd = sd(cm)), col = "red", lwd = 2, add = T)
```
```{r, CLT_10000_samples, fig.path='Figures/', echo = FALSE}
n_sample <- 10000
lambda <- 8
size <- 5
my_pois <- replicate(n_sample, rpois(size, lambda)) # outputs a matrix, each column of which contains 5 random numbers generated from a Poisson distribution with lambda (or mean) of 10
cm <- colMeans(my_pois)
title <- paste("Means of ", n_sample, " samples")
hist(cm, col = "gainsboro", ylim = c(0, 0.5), xlim = c(4,12), main = title, freq = F, breaks = 20)
lines(density(cm), lwd = 2)
curve(dnorm(x, mean = mean(cm), sd = sd(cm)), col = "red", lwd = 2, add = T)
```
<br/>
Rmd files
-----------------------
http://www.stat.cmu.edu/~cshalizi/rmarkdown/
http://rmarkdown.rstudio.com/html_document_format.html#overview
<br/>
* Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.
* In-line code is used like this: In this tutorial, we are using the dataset with `r nrow(data)` data point and `r ncol(data)` variables.
* Name the chunk, like ```{r chunk_name}. This is useful for looking for errors and for figures that will have names based on the name of the chunk
* To pull all of the R code from the Rmd file, use purl("test.Rmd")
* Extract R code and include documentation, use purl("test.Rmd", output = "test.R", documentation = 2)
* To echo the code (but don't run it) from an external script called myscript.R in the Rmarkdown, use {r, code = readLines("myscript.R")}
* Not to show the code, but show the results, use echo = FALSE.
* Not to show the results, but show the code and the figures, use results = "hide".
* Evaluate the chunk but not to show the code and the result, use include = FALSE.
* To hide figure, use fig.show="hide".
* include=FALSE, all of the code, results, and figures will be suppressed.
* warning=FALSE and message=FALSE suppress any R warnings or messages from being included in the final document.
* fig.path='Figs/' makes it so the figure files get placed in the Figs subdirectory. (By default, they are not saved at all.)
* When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file).