From 3dd3c4051fbca03dcfd9685299edea2a6477154b Mon Sep 17 00:00:00 2001 From: wikiselev Date: Tue, 6 Aug 2019 11:50:44 +0100 Subject: [PATCH 1/4] update some files with cache setting --- course_files/Intro-TabulaMuris.Rmd | 61 ++++++++++++++------------ course_files/Intro-to-Bioconductor.Rmd | 23 ++++++---- course_files/_bookdown.yml | 25 ++++++++++- course_files/exp-methods.Rmd | 18 ++++---- course_files/exprs-constr.Rmd | 10 ++--- course_files/ggplot2-pheatmap-PCA.Rmd | 21 +++++---- course_files/intro-to-R.Rmd | 60 ++++++++++++------------- course_files/intro.Rmd | 8 ++-- course_files/process-raw-QC.Rmd | 15 ++++--- course_files/process-raw-align.Rmd | 13 ++++-- course_files/process-raw.Rmd | 42 ++++++++++-------- course_files/scater.Rmd | 8 ++-- course_files/umis.Rmd | 18 ++++---- 13 files changed, 185 insertions(+), 137 deletions(-) diff --git a/course_files/Intro-TabulaMuris.Rmd b/course_files/Intro-TabulaMuris.Rmd index b60ace615..5578938c5 100644 --- a/course_files/Intro-TabulaMuris.Rmd +++ b/course_files/Intro-TabulaMuris.Rmd @@ -2,6 +2,11 @@ output: html_document --- +```{r Intro-TabulaMuris0, echo=FALSE} +library(knitr) +opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo())) +``` + # Tabula Muris ## Introduction @@ -17,7 +22,7 @@ Unlike most single-cell RNASeq data Tabula Muris has release their data through Terminal-based download of FACS data: -```{bash message=FALSE, warning=FALSE, results='hide'} +```{bash Intro-TabulaMuris1, message=FALSE, warning=FALSE, results='hide'} wget https://ndownloader.figshare.com/files/10038307 unzip 10038307 wget https://ndownloader.figshare.com/files/10038310 @@ -27,7 +32,7 @@ mv 10039267 FACS_annotations.csv ``` Terminal-based download of 10X data: -```{bash, message=FALSE, warning=FALSE, results='hide'} +```{bash Intro-TabulaMuris2, message=FALSE, warning=FALSE, results='hide'} wget https://ndownloader.figshare.com/files/10038325 unzip 10038325 wget https://ndownloader.figshare.com/files/10038328 @@ -39,11 +44,11 @@ mv 10039264 droplet_annotation.csv Note if you download the data by hand you should unzip & rename the files as above before continuing. You should now have two folders : "FACS" and "droplet" and one annotation and metadata file for each. To inspect these files you can use the `head` to see the top few lines of the text files (Press "q" to exit): -```{bash} +```{bash Intro-TabulaMuris3} head -n 10 droplet_metadata.csv ``` You can also check the number of rows in each file using: -```{bash} +```{bash Intro-TabulaMuris4} wc -l droplet_annotation.csv ``` @@ -58,13 +63,13 @@ Droplet : 42,193 cells We can now read in the relevant count matrix from the comma-separated file. 
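(Aside: for the larger tissues `read.delim` can be slow; `data.table::fread` is a much faster drop-in alternative. A sketch, assuming the `data.table` package is installed; it is left unevaluated so the standard command below is still the one that runs.)

```{r Intro-TabulaMuris-fread, eval=FALSE}
# Optional faster reader: fread() autodetects the separator and header;
# we convert its result back to a plain data.frame.
dat <- as.data.frame(data.table::fread("FACS/Kidney-counts.csv"))
```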
Then inspect the resulting dataframe: -```{r} +```{r Intro-TabulaMuris5} dat = read.delim("FACS/Kidney-counts.csv", sep=",", header=TRUE) dat[1:5,1:5] ``` We can see that the first column in the dataframe is the gene names, so first we move these to the rownames so we have a numeric matrix: -```{r} +```{r Intro-TabulaMuris6} dim(dat) rownames(dat) <- dat[,1] dat <- dat[,-1] @@ -72,13 +77,13 @@ dat <- dat[,-1] Since this is a Smartseq2 dataset it may contain spike-ins so lets check: -```{r} +```{r Intro-TabulaMuris7} rownames(dat)[grep("^ERCC-", rownames(dat))] ``` Now we can extract much of the metadata for this data from the column names: -```{r} +```{r Intro-TabulaMuris8} cellIDs <- colnames(dat) cell_info <- strsplit(cellIDs, "\\.") Well <- lapply(cell_info, function(x){x[1]}) @@ -88,19 +93,19 @@ Mouse <- unlist(lapply(cell_info, function(x){x[3]})) ``` We can check the distributions of each of these metadata classifications: -```{r} +```{r Intro-TabulaMuris9} summary(factor(Mouse)) ``` We can also check if any technical factors are confounded: -```{r} +```{r Intro-TabulaMuris10} table(Mouse, Plate) ``` Lastly we will read the computationally inferred cell-type annotation and match them to the cell in our expression matrix: -```{r} +```{r Intro-TabulaMuris11} ann <- read.table("FACS_annotations.csv", sep=",", header=TRUE) ann <- ann[match(cellIDs, ann[,1]),] celltype <- ann[,3] @@ -109,7 +114,7 @@ celltype <- ann[,3] ## Building a scater object To create a SingleCellExperiment object we must put together all the cell annotations into a single dataframe, since the experimental batch (pcr plate) is completely confounded with donor mouse we will only keep one of them. -```{r, message=FALSE, warning=FALSE} +```{r Intro-TabulaMuris12, message=FALSE, warning=FALSE} library("SingleCellExperiment") library("scater") cell_anns <- data.frame(mouse = Mouse, well=Well, type=celltype) @@ -118,7 +123,7 @@ sceset <- SingleCellExperiment(assays = list(counts = as.matrix(dat)), colData=c ``` Finally if the dataset contains spike-ins we a hidden variable in the SingleCellExperiment object to track them: -```{r} +```{r Intro-TabulaMuris13} isSpike(sceset, "ERCC") <- grepl("ERCC-", rownames(sceset)) ``` @@ -134,49 +139,49 @@ respectively. We will be using the "Matrix" package to store matrices in sparse-matrix format in R. -```{r} +```{r Intro-TabulaMuris14} library("Matrix") cellbarcodes <- read.table("droplet/Kidney-10X_P4_5/barcodes.tsv") genenames <- read.table("droplet/Kidney-10X_P4_5/genes.tsv") molecules <- readMM("droplet/Kidney-10X_P4_5/matrix.mtx") ``` Now we will add the appropriate row and column names. However, if you inspect the read cellbarcodes you will see that they are just the barcode sequence associated with each cell. This is a problem since each batch of 10X data uses the same pool of barcodes so if we need to combine data from multiple 10X batches the cellbarcodes will not be unique. 
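To see the problem concretely you could compare the raw barcodes of two batches. A sketch, where `cellbarcodes2` stands for a hypothetical second batch read in the same way as above:

```{r Intro-TabulaMuris-collision, eval=FALSE}
# Hypothetical check: how many raw barcodes are shared between two batches?
# A large overlap means a barcode alone cannot identify a cell uniquely.
sum(cellbarcodes[,1] %in% cellbarcodes2[,1])
```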
Hence we will attach the batch ID to each cell barcode: -```{r} +```{r Intro-TabulaMuris15} head(cellbarcodes) ``` -```{r} +```{r Intro-TabulaMuris16} rownames(molecules) <- genenames[,1] colnames(molecules) <- paste("10X_P4_5", cellbarcodes[,1], sep="_") ``` Now lets get the metadata and computational annotations for this data: -```{r} +```{r Intro-TabulaMuris17} meta <- read.delim("droplet_metadata.csv", sep=",", header = TRUE) head(meta) ``` Here we can see that we need to use "10X_P4_5" to find the metadata for this batch, also note that the format of the mouse ID is different in this metadata table with hyphens instead of underscores and with the gender in the middle of the ID. From checking the methods section of the accompanying paper we know that the same 8 mice were used for both droplet and plate-based techniques. So we need to fix the mouse IDs to be consistent with those used in the FACS experiments. -```{r} +```{r Intro-TabulaMuris18} meta[meta$channel == "10X_P4_5",] mouseID <- "3_8_M" ``` Note: depending on the tissue you have been assigned you may have 10X data from mixed samples : e.g. mouse id = 3-M-5/6. You should still reformat these to be consistent but they will not match mouse ids from the FACS data which may affect your downstream analysis. If the mice weren't from an inbred strain it would be possible to assign individual cells to a specific mouse using exonic-SNPs but that is beyond the scope of this course. -```{r} +```{r Intro-TabulaMuris19} ann <- read.delim("droplet_annotation.csv", sep=",", header=TRUE) head(ann) ``` Again you will find a slight formating difference between the cellID in the annotation and the cellbarcodes which we will have to correct before matching them. -```{r} +```{r Intro-TabulaMuris20} ann[,1] <- paste(ann[,1], "-1", sep="") ann_subset <- ann[match(colnames(molecules), ann[,1]),] celltype <- ann_subset[,3] ``` Now lets build the cell-metadata dataframe: -```{r} +```{r Intro-TabulaMuris21} cell_anns <- data.frame(mouse = rep(mouseID, times=ncol(molecules)), type=celltype) rownames(cell_anns) <- colnames(molecules); ``` @@ -184,7 +189,7 @@ rownames(cell_anns) <- colnames(molecules); __Exercise__ Repeat the above for the other 10X batches for your tissue. __Answer__ -```{r, echo=FALSE, eval=TRUE} +```{r Intro-TabulaMuris22, echo=FALSE, eval=TRUE} molecules1 <- molecules cell_anns1 <- cell_anns @@ -221,13 +226,13 @@ cell_anns3 <- cell_anns Now that we have read the 10X data in multiple batches we need to combine them into a single SingleCellExperiment object. First we will check that the gene names are the same and in the same order across all batches: -```{r} +```{r Intro-TabulaMuris23} identical(rownames(molecules1), rownames(molecules2)) identical(rownames(molecules1), rownames(molecules3)) ``` Now we'll check that there aren't any repeated cellIDs: -```{r} +```{r Intro-TabulaMuris24} sum(colnames(molecules1) %in% colnames(molecules2)) sum(colnames(molecules1) %in% colnames(molecules3)) sum(colnames(molecules2) %in% colnames(molecules3)) @@ -235,7 +240,7 @@ sum(colnames(molecules2) %in% colnames(molecules3)) Everything is ok, so we can go ahead and combine them: -```{r} +```{r Intro-TabulaMuris25} all_molecules <- cbind(molecules1, molecules2, molecules3) all_cell_anns <- as.data.frame(rbind(cell_anns1, cell_anns2, cell_anns3)) all_cell_anns$batch <- rep(c("10X_P4_5", "10X_P4_6","10X_P7_5"), times = c(nrow(cell_anns1), nrow(cell_anns2), nrow(cell_anns3))) @@ -245,13 +250,13 @@ __Exercise__ How many cells are in the whole dataset? 
__Answer__ -```{r, echo=FALSE, eval=FALSE} +```{r Intro-TabulaMuris26, echo=FALSE, eval=FALSE} dim(all_molecules)[2] ``` Now build the SingleCellExperiment object. One of the advantages of the SingleCellExperiment class is that it is capable of storing data in normal matrix or sparse matrix format, as well as HDF5 format which allows large non-sparse matrices to be stored & accessed on disk in an efficient manner rather than loading the whole thing into RAM. -```{r} +```{r Intro-TabulaMuris27} all_molecules <- as.matrix(all_molecules) sceset <- SingleCellExperiment( assays = list(counts = as.matrix(all_molecules)), @@ -260,7 +265,7 @@ sceset <- SingleCellExperiment( ``` Since this is 10X data it will not contain spike-ins, so we just save the data: -```{r} +```{r Intro-TabulaMuris28} saveRDS(sceset, "kidney_droplet.rds") ``` diff --git a/course_files/Intro-to-Bioconductor.Rmd b/course_files/Intro-to-Bioconductor.Rmd index 7b63b42f5..a0044f393 100644 --- a/course_files/Intro-to-Bioconductor.Rmd +++ b/course_files/Intro-to-Bioconductor.Rmd @@ -2,6 +2,11 @@ output: html_document --- +```{r intro-to-Bioconductor0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} +library(knitr) +opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo())) +``` + ## Data Types ### What is Tidy Data? @@ -13,12 +18,12 @@ Tidy data is a concept largely defined by Hadley Wickham [@wickham_2014]. Tidy d 3. Each value has its own cell. Here is an example of some tidy data: -```{R,eval=TRUE, echo=FALSE} +```{R intro-to-Bioconductor1,eval=TRUE, echo=FALSE} data.frame(Students=c("Mark", "Jane", "Mohammed", "Tom", "Celia"), Subject=c("Maths", "Biology", "Physics", "Maths", "Computing"), Years=c(1,2,3,2,3), Score=c(5,6,4,7,9)) ``` Here is an example of some untidy data: -```{R, eval=TRUE, echo=FALSE} +```{R intro-to-Bioconductor2, eval=TRUE, echo=FALSE} data.frame(Students=c("Matt", "Matt", "Ellie", "Ellie", "Tim", "Tim", "Louise", "Louise", "Kelly", "Kelly"), Sport=c("Tennis","Tennis", "Rugby", "Rugby","Football", "Football","Swimming","Swimming", "Running", "Running"), Category=c("Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses"), Counts=c(0,1,3,2,1,4,2,2,5,1)) ``` @@ -28,7 +33,7 @@ Tidy data is generally easier to work with than untidy data, especially if you a The untidy data above is untidy because two variables (`Wins` and `Losses`) are stored in one column (`Category`). This is a common way in which data can be untidy. To tidy this data, we need to make `Wins` and `Losses` into columns, and store the values in `Counts` in these columns. Fortunately, there is a function from the tidyverse packages to perform this operation. The function is called `spread`, and it takes two arguments, `key` and `value`. You should pass the name of the column which contains multiple variables to `key`, and pass the name of the column which contains values from multiple variables to `value`. 
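In miniature, the call has this shape (a sketch with placeholder data, assuming the tidyverse is loaded; the worked example follows):

```{R intro-to-Bioconductor-spread-sketch, eval=FALSE}
# Toy input: one 'var' column holding two variable names, one 'val' column
# holding their values. spread() turns them into columns x and y.
toy <- data.frame(id = c("a", "a"), var = c("x", "y"), val = c(1, 2))
spread(toy, key = var, value = val)
```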
For example: -```{R, echo=TRUE, eval=TRUE, message=FALSE} +```{R intro-to-Bioconductor3, echo=TRUE, eval=TRUE, message=FALSE} library(tidyverse) sports<-data.frame(Students=c("Matt", "Matt", "Ellie", "Ellie", "Tim", "Tim", "Louise", "Louise", "Kelly", "Kelly"), Sport=c("Tennis","Tennis", "Rugby", "Rugby","Football", "Football","Swimming","Swimming", "Running", "Running"), Category=c("Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses"), Counts=c(0,1,3,2,1,4,2,2,5,1)) sports @@ -38,19 +43,19 @@ spread(sports, key=Category, value=Counts) Task 2: The dataframe `foods` defined below is untidy. Work out why and use `spread()` to tidy it -```{R, echo=TRUE, eval=TRUE} +```{R intro-to-Bioconductor4, echo=TRUE, eval=TRUE} foods<-data.frame(student=c("Antoinette","Antoinette","Taylor", "Taylor", "Alexa", "Alexa"), Category=c("Dinner", "Dessert", "Dinner", "Dessert", "Dinner","Dessert"), Frequency=c(3,1,4,5,2,1)) ``` The other common way in which data can be untidy is if the columns are values instead of variables. For example, the dataframe below shows the percentages some students got in tests they did in May and June. The data is untidy because the columns `May` and `June` are values, not variables. -```{R} +```{R intro-to-Bioconductor5} percentages<-data.frame(student=c("Alejandro", "Pietro", "Jane"), "May"=c(90,12,45), "June"=c(80,30,100)) ``` Fortunately, there is a function in the tidyverse packages to deal with this problem too. `gather()` takes the names of the columns which are values, the `key` and the `value` as arguments. This time, the `key` is the name of the variable with values as column names, and the `value` is the name of the variable with values spread over multiple columns. Ie: -```{R} +```{R intro-to-Bioconductor6} gather(percentages, "May", "June", key="Month", value = "Percentage") ``` @@ -60,7 +65,7 @@ These examples don't have much to do with single-cell RNA-seq analysis, but are If you google 'rich data', you will find lots of different definitions for this term. In this course, we will use 'rich data' to mean data which is generated by combining information from multiple sources. For example, you could make rich data by creating an object in R which contains a matrix of gene expression values across the cells in your single-cell RNA-seq experiment, but also information about how the experiment was performed. Objects of the `SingleCellExperiment` class, which we will discuss below, are an example of rich data. -```{r, echo=FALSE, message=FALSE} +```{r intro-to-Bioconductor7, echo=FALSE, message=FALSE} library(knitr) library(scater) opts_chunk$set(fig.align = "center") @@ -79,7 +84,7 @@ We strongly recommend all new comers and even experienced high-throughput data a [`SingleCellExperiment`](http://bioconductor.org/packages/SingleCellExperiment) (SCE) is a S4 class for storing data from single-cell experiments. This includes specialized methods to store and retrieve spike-in information, dimensionality reduction coordinates and size factors for each cell, along with the usual metadata for genes and libraries. 
In practice, an object of this class can be created using its constructor: -```{r message=FALSE, warning=FALSE} +```{r intro-to-Bioconductor8, message=FALSE, warning=FALSE} library(SingleCellExperiment) counts <- matrix(rpois(100, lambda = 10), ncol=10, nrow=10) rownames(counts) <- paste("gene", 1:10, sep = "") @@ -102,7 +107,7 @@ In the `SingleCellExperiment`, users can assign arbitrary names to entries of as Each of these suggested names has an appropriate getter/setter method for convenient manipulation of the `SingleCellExperiment`. For example, we can take the (very specifically named) `counts` slot, normalise it and assign it to `normcounts` instead: -```{r} +```{r intro-to-Bioconductor9} normcounts(sce) <- log2(counts(sce) + 1) sce dim(normcounts(sce)) diff --git a/course_files/_bookdown.yml b/course_files/_bookdown.yml index da85a9e05..1faa87e4b 100644 --- a/course_files/_bookdown.yml +++ b/course_files/_bookdown.yml @@ -2,4 +2,27 @@ book_filename: "scRNA-seq-course" chapter_name: "" output_dir: website new_session: yes -rmd_files: ["index.Rmd", "intro.Rmd", "exp-methods.Rmd", "process-raw-QC.Rmd", "process-raw.Rmd", "process-raw-align.Rmd", "exprs-constr.Rmd", "umis.Rmd", "intro-to-R.Rmd", "Intro-to-Bioconductor.Rmd", "scater.Rmd", "ggplot2-pheatmap-PCA.Rmd", "Intro-TabulaMuris.Rmd", "exprs-qc.Rmd", "exprs-qc-reads.Rmd", "exprs-overview.Rmd", "exprs-overview-reads.Rmd", "confounders.Rmd", "confounders-reads.Rmd", "exprs-norm.Rmd", "exprs-norm-reads.Rmd", "remove-conf.Rmd", "remove-conf-reads.Rmd", "clust-intro.Rmd", "clustering.Rmd", "dropouts.Rmd", "pseudotime.Rmd", "imputation.Rmd", "de-intro.Rmd", "de-real.Rmd", "projection.Rmd", "search.Rmd", "seurat.Rmd", "ideal-scrnaseq-pipeline.Rmd", "advanced.Rmd", "tools.Rmd", "references.Rmd"] +rmd_files: ["index.Rmd", +"intro.Rmd", +"exp-methods.Rmd", +"process-raw-QC.Rmd", "process-raw.Rmd", "process-raw-align.Rmd", +"exprs-constr.Rmd", "umis.Rmd", +"intro-to-R.Rmd", "Intro-to-Bioconductor.Rmd", "scater.Rmd", "ggplot2-pheatmap-PCA.Rmd", +"Intro-TabulaMuris.Rmd", +"exprs-qc.Rmd", "exprs-qc-reads.Rmd", "exprs-overview.Rmd", +"exprs-overview-reads.Rmd", +"confounders.Rmd", "confounders-reads.Rmd", +"exprs-norm.Rmd", "exprs-norm-reads.Rmd", +"remove-conf.Rmd", "remove-conf-reads.Rmd", +"clust-intro.Rmd", "clustering.Rmd", +"dropouts.Rmd", +"pseudotime.Rmd", +"imputation.Rmd", +"de-intro.Rmd", "de-real.Rmd", +"projection.Rmd", +"search.Rmd", +"seurat.Rmd", +"ideal-scrnaseq-pipeline.Rmd", +"advanced.Rmd", +"tools.Rmd", +"references.Rmd"] diff --git a/course_files/exp-methods.Rmd b/course_files/exp-methods.Rmd index c2b26f6c1..37f8b8fe8 100644 --- a/course_files/exp-methods.Rmd +++ b/course_files/exp-methods.Rmd @@ -2,15 +2,15 @@ output: html_document --- -```{r, echo=FALSE} +```{r exp-methods0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} library(knitr) -opts_chunk$set(fig.align = "center", echo=FALSE, out.width = '70%') +opts_chunk$set(fig.align = "center", echo=FALSE, out.width = '70%', cache=TRUE, cache.extra = list(R.version, sessionInfo())) ``` ## Experimental methods -```{r, fig.cap="Moore's law in single cell transcriptomics (image taken from [Svensson et al](https://arxiv.org/abs/1704.01379))", out.width = '100%'} -knitr::include_graphics("figures/moores-law.png") +```{r exp-methods1, fig.cap="Moore's law in single cell transcriptomics (image taken from [Svensson et al](https://arxiv.org/abs/1704.01379))", out.width = '100%'} +knitr::include_graphics("figures/moores-law.png", cache=TRUE) ``` Development of 
new methods and protocols for scRNA-seq is currently a very active area of research, and several protocols have been published over the last few years. A non-comprehensive list includes: @@ -33,19 +33,19 @@ For quantification, there are two types, __full-length__ and __tag-based__. The The strategy used for capture determines throughput, how the cells can be selected as well as what kind of additional information besides the sequencing that can be obtained. The three most widely used options are __microwell-__, __microfluidic-__ and __droplet-__ based. -```{r, fig.cap="Image of microwell plates (image taken from Wikipedia)"} +```{r exp-methods2, fig.cap="Image of microwell plates (image taken from Wikipedia)"} knitr::include_graphics("figures/300px-Microplates.jpg") ``` For well-based platforms, cells are isolated using for example pipette or laser capture and placed in microfluidic wells. One advantage of well-based methods is that they can be combined with fluorescent activated cell sorting (FACS), making it possible to select cells based on surface markers. This strategy is thus very useful for situations when one wants to isolate a specific subset of cells for sequencing. Another advantage is that one can take pictures of the cells. The image provides an additional modality and a particularly useful application is to identify wells containg damaged cells or doublets. The main drawback of these methods is that they are often low-throughput and the amount of work required per cell may be considerable. -```{r, fig.cap="Image of a 96-well Fluidigm C1 chip (image taken from Fluidigm)"} +```{r exp-methods3, fig.cap="Image of a 96-well Fluidigm C1 chip (image taken from Fluidigm)"} knitr::include_graphics("figures/fluidigmC1.jpg") ``` Microfluidic platforms, such as [Fluidigm's C1](https://www.fluidigm.com/products/c1-system#workflow), provide a more integrated system for capturing cells and for carrying out the reactions necessary for the library preparations. Thus, they provide a higher throughput than microwell based platforms. Typically, only around 10% of cells are captured in a microfluidic platform and thus they are not appropriate if one is dealing with rare cell-types or very small amounts of input. Moreover, the chip is relatively expensive, but since reactions can be carried out in a smaller volume money can be saved on reagents. -```{r, out.width = '60%', fig.cap="Schematic overview of the drop-seq method (Image taken from Macosko et al)"} +```{r exp-methods4, out.width = '60%', fig.cap="Schematic overview of the drop-seq method (Image taken from Macosko et al)"} knitr::include_graphics("figures/drop-seq.png") ``` @@ -59,13 +59,13 @@ Clearly, full-length transcript quantification will be more appropriate if one i Two recent studies from the Enard group [@Ziegenhain2017-cu] and the Teichmann group [@Svensson2017-op] have compared several different protocols. In their study, Ziegenhain et al compared five different protocols on the same sample of mouse embryonic stem cells (mESCs). By controlling for the number of cells as well as the sequencing depth, the authors were able to directly compare the sensitivity, noise-levels and costs of the different protocols. One example of their conclusions is illustrated in the figure below which shows the number of genes detected (for a given detection threshold) for the different methods. 
As you can see, there is almost a two-fold difference between drop-seq and Smart-seq2, suggesting that the choice of protocol can have a major impact on the study -```{r, out.width = '60%', fig.cap="Enard group study"} +```{r exp-methods5, out.width = '60%', fig.cap="Enard group study"} knitr::include_graphics("figures/ziegenhainEnardFig1.png") ``` Svensson et al take a different approach by using synthetic transcripts (spike-ins, more about these later) with known concentrations to measure the accuracy and sensitivity of different protocols. Comparing a wide range of studies, they also reported substantial differences between the protocols. -```{r, out.width = '100%', fig.cap="Teichmann group study"} +```{r exp-methods6, out.width = '100%', fig.cap="Teichmann group study"} knitr::include_graphics("figures/svenssonTeichmannFig2.png") ``` diff --git a/course_files/exprs-constr.Rmd b/course_files/exprs-constr.Rmd index 7579dd46f..047f8459b 100644 --- a/course_files/exprs-constr.Rmd +++ b/course_files/exprs-constr.Rmd @@ -4,9 +4,9 @@ output: html_document # Construction of expression matrix -```{r, echo=FALSE} +```{r exprs-constr0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} library(knitr) -opts_chunk$set(fig.align = "center", echo=FALSE) +opts_chunk$set(fig.align = "center", echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())) ``` Many analyses of scRNA-seq data take as their starting point an __expression matrix__. By convention, the each row of the expression matrix represents a gene and each column represents a cell (although some authors use the transpose). Each entry represents the expression level of a particular gene in a given cell. The units by which the expression is meassured depends on the protocol and the normalization strategy used. @@ -22,7 +22,7 @@ $/fastQC experiment.bam Below is an example of the output from FastQC for a dataset of 125 bp reads. The plot reveals a technical error which resulted in a couple of bases failing to be read correctly in the centre of the read. However, since the rest of the read was of high quality this error will most likely have a negligible effect on mapping efficiency. -```{r exprs-constr-fastqc, out.width = '90%', fig.cap="Example of FastQC output"} +```{r exprs-constr1, out.width = '90%', fig.cap="Example of FastQC output"} knitr::include_graphics("figures/per_base_quality.png") ``` @@ -73,7 +73,7 @@ analysis. The two yellow arrows point to cells with a surprisingly large number of unmapped reads. In this example we kept the cells during the alignment QC step, but they were later removed during cell QC due to a high proportion of ribosomal RNA reads. -```{r exprs-constr-total-num-cells, out.width = '90%', fig.cap="Example of the total number of reads mapped to each cell."} +```{r exprs-constr2, out.width = '90%', fig.cap="Example of the total number of reads mapped to each cell."} knitr::include_graphics("figures/Bergiers_exp1_mapping_by_cell.png") ``` @@ -89,7 +89,7 @@ python /split_bam.py -i input.bam -r rRNAmask.bed -o output.txt However the expected results will depend on the experimental protocol, e.g. many scRNA-seq methods use poly-A selection to avoid sequencing rRNAs which results in a 3' bias in the read coverage across the genes (aka gene body coverage). 
The figure below shows this 3' bias as well as three cells which were outliers and removed from the dataset: -```{r exprs-constr-3-bias, out.width = '90%', fig.cap="Example of the 3' bias in the read coverage."} +```{r exprs-constr3, out.width = '90%', fig.cap="Example of the 3' bias in the read coverage."} knitr::include_graphics("figures/Exp1_RSEQC_geneBodyCoverage_plot_Combined.png") ``` diff --git a/course_files/ggplot2-pheatmap-PCA.Rmd b/course_files/ggplot2-pheatmap-PCA.Rmd index 9d9a2eed9..960653837 100644 --- a/course_files/ggplot2-pheatmap-PCA.Rmd +++ b/course_files/ggplot2-pheatmap-PCA.Rmd @@ -2,6 +2,11 @@ output: html_document --- +```{r ggplot2-pheatmap-PCA0, echo=FALSE} +library(knitr) +opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo())) +``` + ## An Introduction to ggplot2 ### What is ggplot2? @@ -19,7 +24,7 @@ ggplot2 is an R package designed by Hadley Wickham which facilitates data plotti The `aes` function specifies how variables in your dataframe map to features on your plot. To understand how this works, let's look at an example: -```{R, eval=TRUE, message=FALSE} +```{R ggplot2-pheatmap-PCA1, eval=TRUE, message=FALSE} library(ggplot2) library(tidyverse) set.seed(1) @@ -43,7 +48,7 @@ We can use geoms to specify how we would like data to be displayed on our graphs Let's see how our graph would look as a scatterplot. -```{R, eval=TRUE} +```{R ggplot2-pheatmap-PCA2, eval=TRUE} ggplot(data = counts, mapping = aes(x = cell1, y = cell2)) + geom_point() ``` @@ -57,14 +62,14 @@ So far we've been considering the gene counts from 2 of the cells in our datafra At the moment we can't do this because we are treating each individual cell as a variable and assigning that variable to either the x or the y axis. We could create a 10 dimensional graph to plot data from all 10 cells on, but this is a) not possible to do with ggplot and b) not very easy to interpret. What we could do instead is to tidy our data so that we had one variable representing cell ID and another variable representing gene counts, and plot those against each other. In code, this would look like: -```{R, eval=TRUE} +```{R ggplot2-pheatmap-PCA3, eval=TRUE} counts<-gather(counts, colnames(counts)[2:11], key = 'Cell_ID', value='Counts') head(counts) ``` Essentially, the problem before was that our data was not tidy because one variable (Cell_ID) was spread over multiple columns. Now that we've fixed this problem, it is much easier for us to plot data from all 10 cells on one graph. -```{R, eval=TRUE} +```{R ggplot2-pheatmap-PCA4, eval=TRUE} ggplot(counts,aes(x=Cell_ID, y=Counts)) + geom_boxplot() ``` @@ -76,7 +81,7 @@ Task 4: Use the updated `counts` dataframe to plot a scatterplot with Gene_ids a A common method for visualising gene expression data is with a heatmap. Here we will use the R package `pheatmap` to perform this analysis with some gene expression data we will name `test`. -```{R, eval=TRUE, message=FALSE} +```{R ggplot2-pheatmap-PCA5, eval=TRUE, message=FALSE} library(pheatmap) set.seed(2) test = matrix(rnorm(200), 20, 10) @@ -94,7 +99,7 @@ This plot also gives us information on the results of a clustering algorithm. In If we look closely at the trees, we can see that eventually they have the same number of branches as there are cells and genes. In other words, the total number of cell clusters is the same as the total number of cells, and the total number of gene clusters is the same as the total number of genes. 
Clearly, this is not very informative, and will become impractical when we are looking at more than 10 cells and 20 genes. Fortunately, we can set the number of clusters we see on the plot. Let's try setting the number of gene clusters to 2: -```{R, eval=TRUE} +```{R ggplot2-pheatmap-PCA6, eval=TRUE} pheatmap(test, kmeans_k = 2) ``` @@ -110,7 +115,7 @@ PCA plots are a good way to get an overview of your data, and can sometimes help Let's make a PCA plot for our `test` data. We can use the `ggfortify` package to let ggplot know how to interpret principle components. -```{R,eval=TRUE, message=FALSE} +```{R ggplot2-pheatmap-PCA7,eval=TRUE, message=FALSE} library(ggfortify) Principal_Components<-prcomp(test) autoplot(Principal_Components, label=TRUE) @@ -120,7 +125,7 @@ Task 6: Compare your clusters to the pheatmap clusters. Are they related? (Hint: Task 7: Produce a heatmap and PCA plot for `counts` (below): -```{R,eval=TRUE} +```{R ggplot2-pheatmap-PCA8,eval=TRUE} set.seed(1) counts <- as.data.frame(matrix(rpois(100, lambda = 10), ncol=10, nrow=10)) rownames(counts) <- paste("gene", 1:10, sep = "") diff --git a/course_files/intro-to-R.Rmd b/course_files/intro-to-R.Rmd index ca95cb996..a78e76ca4 100644 --- a/course_files/intro-to-R.Rmd +++ b/course_files/intro-to-R.Rmd @@ -2,9 +2,9 @@ output: html_document --- -```{r, echo=FALSE} +```{r intro-to-R0, echo=FALSE} library(knitr) -opts_chunk$set(out.width='90%', fig.align = 'center') +opts_chunk$set(out.width='90%', fig.align = 'center', cache=TRUE, cache.extra = list(R.version, sessionInfo())) ``` # Introduction to R/Bioconductor @@ -15,7 +15,7 @@ opts_chunk$set(out.width='90%', fig.align = 'center') The Comprehensive R Archive Network [CRAN](https://cran.r-project.org/) is the biggest archive of R packages. There are few requirements for uploading packages besides building and installing succesfully, hence documentation and support is often minimal and figuring how to use these packages can be a challenge it itself. CRAN is the default repository R will search to find packages to install: -```{r, eval=FALSE} +```{r intro-to-R1, eval=FALSE} install.packages("devtools") require("devtools") ``` @@ -24,13 +24,13 @@ require("devtools") [Github](https://github.com/) isn't specific to R, any code of any type in any state can be uploaded. There is no guarantee a package uploaded to github will even install, nevermind do what it claims to do. R packages can be downloaded and installed directly from github using the "devtools" package installed above. -```{r, eval=FALSE} +```{r intro-to-R2, eval=FALSE} devtools::install_github("tallulandrews/M3Drop") ``` Github is also a version control system which stores multiple versions of any package. By default the most recent "master" version of the package is installed. If you want an older version or the development branch this can be specified using the "ref" parameter: -```{r, eval=FALSE} +```{r intro-to-R3, eval=FALSE} # different branch devtools::install_github("tallulandrews/M3D", ref="nbumi") # previous commit @@ -42,7 +42,7 @@ Note: make sure you re-install the M3Drop master branch for later in the course. Bioconductor is a repository of R-packages specifically for biological analyses. It has the strictest requirements for submission, including installation on every platform and full documentation with a tutorial (called a vignette) explaining how the package should be used. 
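Once a package is installed you can pull up its vignettes directly from R; shown here for `SingleCellExperiment`, but any installed Bioconductor package works the same way:

```{r intro-to-R-vignettes, eval=FALSE}
# List the vignettes shipped with a package ...
vignette(package = "SingleCellExperiment")
# ... or open them all in a web browser.
browseVignettes("SingleCellExperiment")
```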
Bioconductor also encourages utilization of standard data structures/classes and coding style/naming conventions, so that, in theory, packages and analyses can be combined into large pipelines or workflows. -```{r, eval=FALSE} +```{r intro-to-R4, eval=FALSE} source("https://bioconductor.org/biocLite.R") biocLite("edgeR") ``` @@ -51,7 +51,7 @@ Note: in some situations it is necessary to substitute "http://" for "https://" Bioconductor also requires creators to support their packages and has a regular 6-month release schedule. Make sure you are using the most recent release of bioconductor before trying to install packages for the course. -```{r, eval=FALSE} +```{r intro-to-R5, eval=FALSE} source("https://bioconductor.org/biocLite.R") biocLite("BiocUpgrade") ``` @@ -59,7 +59,7 @@ biocLite("BiocUpgrade") ### Source The final way to install packages is directly from source. In this case you have to download a fully built source code file, usually packagename.tar.gz, or clone the github repository and rebuild the package yourself. Generally this will only be done if you want to edit a package yourself, or if for some reason the former methods have failed. -```{r, eval=FALSE} +```{r intro-to-R6, eval=FALSE} install.packages("M3Drop_3.05.00.tar.gz", type="source") ``` @@ -76,7 +76,7 @@ Aside: R can also store data as "complex" for complex numbers but generally this The "numeric" class is the default class for storing any numeric data - integers, decimal numbers, numbers in scientific notation, etc... -```{r} +```{r intro-to-R7} x = 1.141 class(x) y = 42 @@ -87,7 +87,7 @@ class(z) Here we see that even though R has an "integer" class and 42 could be stored more efficiently as an integer the default is to store it as "numeric". If we want 42 to be stored as an integer we must "coerce" it to that class: -```{r} +```{r intro-to-R8} y = as.integer(42) class(y) ``` @@ -95,7 +95,7 @@ class(y) Coercion will force R to store data as a particular class, if our data is incompatible with that class it will still do it but the data will be converted to NAs: -```{r} +```{r intro-to-R9} as.numeric("H") ``` @@ -104,7 +104,7 @@ Above we tried to coerce "character" data, identified by the double quotation ma ### Character/String The "character" class stores all kinds of text data. Programing convention calls data containing multiple letters a "string", thus most R functions which act on character data will refer to the data as "strings" and will often have "str" or "string" in it's name. Strings are identified by being flanked by double quotation marks, whereas variable/function names are not: -```{r} +```{r intro-to-R10} x = 5 a = "x" # character "x" @@ -115,7 +115,7 @@ b ``` In addition to standard alphanumeric characters, strings can also store various special characters. Special characters are identified using a backlash followed by a single character, the most relevant are the special character for tab : `\t` and new line : `\n`. To demonstrate the these special characters lets concatenate (cat) together two strings with these characters separating (sep) them: -```{r} +```{r intro-to-R11} cat("Hello", "World", sep= " ") cat("Hello", "World", sep= "\t") @@ -124,7 +124,7 @@ cat("Hello", "World", sep= "\n") ``` Note that special characters work differently in different functions. For instance the `paste` function does the same thing as `cat` but does not recognize special characters. 
-```{r} +```{r intro-to-R12} paste("Hello", "World", sep= " ") paste("Hello", "World", sep= "\t") @@ -134,18 +134,18 @@ paste("Hello", "World", sep= "\n") Single or double backslash is also used as an `escape` character to turn off special characters or allow quotation marks to be included in strings: -```{r} +```{r intro-to-R13} cat("This \"string\" contains quotation marks.") ``` Special characters are generally only used in pattern matching, and reading/writing data to files. For instance this is how you would read a tab-separated file into R. -```{r, eval=FALSE} +```{r intro-to-R14, eval=FALSE} dat = read.delim("file.tsv", sep="\t") ``` Another special type of character data are colours. Colours can be specified in three main ways: by name from those [available](http://bxhorn.com/r-color-tables/), by red, green, blue values using the `rgb` function, and by hue (colour), saturation (colour vs white) and value (colour/white vs black) using the `hsv` function. By default rgb and hsv expect three values in 0-1 with an optional fourth value for transparency. Alternatively, sets of predetermined colours with useful properties can be loaded from many different packages with [RColorBrewer](http://colorbrewer2.org/) being one of the most popular. -```{r} +```{r intro-to-R15} reds = c("red", rgb(1,0,0), hsv(0, 1, 1)) reds barplot(c(1,1,1), col=reds, names=c("by_name", "by_rgb", "by_hsv")) @@ -155,7 +155,7 @@ barplot(c(1,1,1), col=reds, names=c("by_name", "by_rgb", "by_hsv")) The `logical` class stores boolean truth values, i.e. TRUE and FALSE. It is used for storing the results of logical operations and conditional statements will be coerced to this class. Most other data-types can be coerced to boolean without triggering (or "throwing") error messages, which may cause unexpected behaviour. -```{r} +```{r intro-to-R16} x = TRUE class(x) @@ -182,7 +182,7 @@ Do you ever throw a warning/error message? String/Character data is very memory inefficient to store, each letter generally requires the same amount of memory as any integer. Thus when storing a vector of strings with repeated elements it is more efficient assign each element to an integer and store the vector as integers and an additional string-to-integer association table. Thus, by default R will read in text columns of a data table as factors. -```{r} +```{r intro-to-R17} str_vector = c("Apple", "Apple", "Banana", "Banana", "Banana", "Carrot", "Carrot", "Apple", "Banana") factored_vector = factor(str_vector) factored_vector @@ -191,12 +191,12 @@ as.numeric(factored_vector) The double nature of factors can cause some unintuitive behaviour. E.g. joining two factors together will convert them to the numeric form and the original strings will be lost. -```{r} +```{r intro-to-R18} c(factored_vector, factored_vector) ``` Likewise if due to formatting issues numeric data is mistakenly interpretted as strings, then you must convert the factor back to strings before coercing to numeric values: -```{r} +```{r intro-to-R19} x = c("20", "25", "23", "38", "20", "40", "25", "30") x = factor(x) as.numeric(x) @@ -205,13 +205,13 @@ as.numeric(as.character(x)) To make R read text as character data instead of factors set the environment option `stringsAsFactors=FALSE`. This must be done at the start of each R session. -```{r} +```{r intro-to-R20} options(stringsAsFactors=FALSE) ``` __Exercise__ How would you use factors to create a vector of colours for an arbitrarily long vector of fruits like `str_vector` above? 
__Answer__ -```{r, include=FALSE} +```{r intro-to-R21, include=FALSE} long_str_vector = c(str_vector, str_vector, str_vector) fruit_cols = c("red", "yellow", "orange") fruit_colour_vec = fruit_cols[as.numeric(factor(long_str_vector, levels=c("Apple", "Banana", "Carrot")))] @@ -220,7 +220,7 @@ fruit_colour_vec = fruit_cols[as.numeric(factor(long_str_vector, levels=c("Apple ### Checking class/type We recommend checking your data is of the correct class after reading from files: -```{r} +```{r intro-to-R22} x = 1.4 is.numeric(x) is.character(x) @@ -232,7 +232,7 @@ is.factor(x) ## Basic data structures So far we have only looked at single values and vectors. Vectors are the simplest data structure in R. They are a 1-dimensional array of data all of the same type. If the input when creating a vector is of different types it will be coerced to the data-type that is most consistent with the data. -```{r} +```{r intro-to-R23} x = c("Hello", 5, TRUE) x class(x) @@ -242,7 +242,7 @@ Here we tried to put character, numeric and logical data into a single vector so A `matrix` is the two dimensional version of a vector, it also requires all data to be of the same type. If we combine a character vector and a numeric vector into a matrix, all the data will be coerced to characters: -```{r} +```{r intro-to-R24} x = c("A", "B", "C") y = c(1, 2, 3) class(x) @@ -253,7 +253,7 @@ m The quotation marks indicate that the numeric vector has been coerced to characters. Alternatively, to store data with columns of different data-types we can use a dataframe. -```{r} +```{r intro-to-R25} z = data.frame(x, y) z class(z[,1]) @@ -262,7 +262,7 @@ class(z[,2]) If you have set stringsAsFactors=FALSE as above you will find the first column remains characters, otherwise it will be automatically converted to a factor. -```{r} +```{r intro-to-R26} options(stringsAsFactors=TRUE) z = data.frame(x, y) class(z[,1]) @@ -270,14 +270,14 @@ class(z[,1]) Another difference between matrices and dataframes is the ability to select columns using the `$` operator: -```{r, eval=FALSE} +```{r intro-to-R27, eval=FALSE} m$x # throws an error z$x # ok ``` The final basic data structure is the `list`. Lists allow data of different types and different lengths to be stored in a single object. Each element of a list can be any other R object : data of any type, any data structure, even other lists or functions. 
-```{r} +```{r intro-to-R28} l = list(m, z) ll = list(sublist=l, a_matrix=m, numeric_value=42, this_string="Hello World", even_a_function=cbind) ll diff --git a/course_files/intro.Rmd b/course_files/intro.Rmd index 751f1e67b..093fecdc7 100644 --- a/course_files/intro.Rmd +++ b/course_files/intro.Rmd @@ -4,9 +4,9 @@ output: html_document # Introduction to single-cell RNA-seq -```{r, echo=FALSE} +```{r intro0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} library(knitr) -opts_chunk$set(fig.align = "center", echo=FALSE) +opts_chunk$set(fig.align = "center", echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())) ``` ## Bulk RNA-seq @@ -32,7 +32,7 @@ opts_chunk$set(fig.align = "center", echo=FALSE) ## Workflow -```{r intro-rna-seq-workflow, out.width = '90%', fig.cap="Single cell sequencing (taken from Wikipedia)"} +```{r intro1, out.width = '90%', fig.cap="Single cell sequencing (taken from Wikipedia)"} knitr::include_graphics("figures/RNA-Seq_workflow-5.pdf.jpg") ``` @@ -43,7 +43,7 @@ Overall, experimental scRNA-seq protocols are similar to the methods used for bu This course is concerned with the computational analysis of the data obtained from scRNA-seq experiments. The first steps (yellow) are general for any highthroughput sequencing data. Later steps (orange) require a mix of existing RNASeq analysis methods and novel methods to address the technical difference of scRNASeq. Finally the biological interpretation (blue) __should__ be analyzed with methods specifically developed for scRNASeq. -```{r intro-flowchart, out.width = '65%', fig.cap="Flowchart of the scRNA-seq analysis"} +```{r intro2, out.width = '65%', fig.cap="Flowchart of the scRNA-seq analysis"} knitr::include_graphics("figures/flowchart.png") ``` diff --git a/course_files/process-raw-QC.Rmd b/course_files/process-raw-QC.Rmd index 1b639a417..329b81cbc 100644 --- a/course_files/process-raw-QC.Rmd +++ b/course_files/process-raw-QC.Rmd @@ -3,8 +3,9 @@ output: html_document code_folding: hide --- -```{r include=FALSE} -library('bookdown') +```{r process-raw-QC0, include=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} +library(knitr) +opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo())) ``` # Processing Raw scRNA-seq Data @@ -24,14 +25,14 @@ Today we will be performing our analysis using a single cell from an mESC datase __Note__ The current text of the course is written for an AWS server for people who attend our course in person. You will have to download the files (both `ERR522959_1.fastq` and `ERR522959_2.fastq`) and create `Share` directory yourself to run the commands. You can find the files here: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-2600/samples/ Now let's look at the files: -```{bash, eval=FALSE} +```{bash process-raw-QC1, eval=FALSE} less Share/ERR522959_1.fastq less Share/ERR522959_2.fastq ``` Task 1: Try to work out what command you should use to produce the FastQC report. Hint: Try executing -```{bash, eval=FALSE, collapse=TRUE} +```{bash process-raw-QC2, eval=FALSE, collapse=TRUE} fastqc -h ``` @@ -42,7 +43,7 @@ This command will tell you what options are available to pass to FastQC. Feel fr If you haven't done so already, generate the FastQC report using the commands below: -```{bash, eval=FALSE, echo = TRUE} +```{bash process-raw-QC3, eval=FALSE, echo = TRUE} mkdir fastqc_results fastqc -o fastqc_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq ``` @@ -65,7 +66,7 @@ Now let's try to use Trim Galore! 
to remove those problematic adapters. It's a g Task 3: Work out the command you should use to trim the adapters from our data. Hint 1: You can use -```{bash, eval=FALSE} +```{bash process-raw-QC4, eval=FALSE} trim_galore -h ``` @@ -80,7 +81,7 @@ Once you think you have successfully trimmed your reads and have confirmed this You can use the command(s) below to trim the Nextera sequencing adapters: -```{bash, eval=FALSE} +```{bash process-raw-QC5, eval=FALSE} mkdir fastqc_trimmed_results trim_galore --nextera -o fastqc_trimmed_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq ``` diff --git a/course_files/process-raw-align.Rmd b/course_files/process-raw-align.Rmd index 79db2eafe..16f3d1e48 100644 --- a/course_files/process-raw-align.Rmd +++ b/course_files/process-raw-align.Rmd @@ -2,6 +2,11 @@ output: html_document --- +```{r process-raw-align0, include=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} +library(knitr) +opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo())) +``` + ## Using STAR to Align Reads Now we have trimmed our reads and established that they are of good quality, we would like to map them to a reference genome. This process is known as alignment. Some form of alignment is generally required if we want to quantify gene expression or find genes which are differentially expressed between samples. @@ -17,7 +22,7 @@ Two steps are required to perform STAR alignment. In the first step, the user pr Let's create the index now. Remember, for reasons of time we are aligning to a transcriptome rather than a genome today, meaning we only need to provide STAR with the sequences of the transcripts we will be aligning reads to. You can obtain transcriptomes for many model organisms from Ensembl (https://www.ensembl.org/info/data/ftp/index.html). Task 1: Execute the commands below to create the index: -```{bash, eval=FALSE} +```{bash process-raw-align1, eval=FALSE} mkdir indices mkdir indices/STAR STAR --runThreadN 4 --runMode genomeGenerate --genomeDir indices/STAR --genomeFastaFiles Share/2000_reference.transcripts.fa @@ -36,7 +41,7 @@ Task 5: Try to understand the output of your alignment. Talk to one of the instr ### Solution for STAR Alignment You can use the folowing commands to perform the mapping step: -```{bash,eval=FALSE} +```{bash process-raw-align2,eval=FALSE} mkdir results mkdir results/STAR @@ -78,7 +83,7 @@ As for STAR, you will need to produce an index for Kallisto before the pseudo-al Task 6: Use the below command to produce the Kallisto index. Use the Kallisto manual (https://pachterlab.github.io/kallisto/manual) to work out what the options do in this command. 
-```{bash, eval=FALSE} +```{bash process-raw-align3, eval=FALSE} mkdir indices/Kallisto kallisto index -i indices/Kallisto/transcripts.idx Share/2000_reference.transcripts.fa ``` @@ -89,7 +94,7 @@ Task 7: Use the Kallisto manual to work out what command to use to perform pseud Use the below command to perform pseudo-alignment -```{bash, eval=FALSE} +```{bash process-raw-align4, eval=FALSE} mkdir results/Kallisto kallisto pseudo -i indices/Kallisto/transcripts.idx -o results/Kallisto -b batch.txt ``` diff --git a/course_files/process-raw.Rmd b/course_files/process-raw.Rmd index 609c2d374..97fd83c70 100644 --- a/course_files/process-raw.Rmd +++ b/course_files/process-raw.Rmd @@ -2,6 +2,10 @@ output: html_document --- +```{r process-raw0, include=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} +library(knitr) +opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo())) +``` ## File formats @@ -15,7 +19,7 @@ Thus reads will be mapped as if they are single-end sequenced despite actually being paired end. FastQ files have the format: -```{eval=FALSE} +```{bash process-raw1, eval=FALSE} >ReadID READ SEQUENCE + @@ -55,7 +59,7 @@ Alignment rows employ a standard format with the following columns: BAM/SAM files can be converted to the other format using 'samtools': -```{bash, eval=FALSE} +```{bash process-raw2, eval=FALSE} samtools view -S -b file.sam > file.bam samtools view -h file.bam > file.sam ``` @@ -73,7 +77,7 @@ multi-mapping reads first sort by read name and remove secondary alignments using samtools. [Picard](https://broadinstitute.github.io/picard/index.html) also contains a method for converting BAM to FastQ files. -```{bash, eval=FALSE} +```{bash process-raw3, eval=FALSE} # sort reads by name samtools sort -n original.bam -o sorted_by_name.bam # remove secondary alignments @@ -95,7 +99,7 @@ Alternatively, you may pre-download the correct reference either from metadata i of the CRAM file, or from talking to whomever generated the CRAM and specify that file using '-T' Thus we recommend setting a specific cache location prior to doing this: -```{bash, eval=FALSE} +```{bash process-raw4, eval=FALSE} export REF_CACHE=/path_to/cache_directory_for_reference_genome samtools view -b -h -T reference_genome.fasta file.cram -o file.bam samtools view -C -h -T reference_genome.fasta file.bam -o file.cram @@ -108,7 +112,7 @@ are from the correct sample. 'less' and 'more' can be used to inspect any text f By "pipe-ing" the output of samtools view into these commands using '|' we check each of these file types without having to save multiple copies of each file. -```{bash, eval=FALSE} +```{bash process-raw5, eval=FALSE} less file.txt more file.txt # counts the number of lines in file.txt @@ -133,7 +137,7 @@ by entering running the command "naked" - e.g. 'samtools view', 'bedtools' __Answer__ -```{bash, eval=TRUE, include=FALSE} +```{bash process-raw6, eval=TRUE, include=FALSE} samtools view -T data/2000_reference.transcripts.fa -H data/EXAMPLE.cram | more samtools view -T data/2000_reference.transcripts.fa -f 4 data/EXAMPLE.cram | wc -l # unmapped samtools view -T data/2000_reference.transcripts.fa -F 260 data/EXAMPLE.cram | wc -l # mapped @@ -184,7 +188,7 @@ for any non-standard sequences added. There is no standardized way to do this. So below is our custom perl script for creating a gtf and fasta file for ERCCs which can be appended to the genome. 
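Once those two files exist, appending them is just a concatenation. A sketch in R (the file names are placeholders for your own genome and ERCC files):

```{r process-raw-append-ercc, eval=FALSE}
# Hypothetical file names; the R equivalent of `cat ERCC.fa >> genome.fa`.
file.append("genome.fa", "ERCC.fa")
file.append("genome.gtf", "ERCC.gtf")
```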
You may also need to alter a gtf file to deal with repetitive elements in introns when/if you want to quantify intronic reads. Any scripting language or even 'awk' and/or some text editors can be used to do this relatively efficiently, but they are beyond the scope of this course. -```{bash, eval=FALSE} +```{bash process-raw7, eval=FALSE} # Converts the Annotation file from # https://www.thermofisher.com/order/catalog/product/4456740 to # gtf and fasta files that can be added to existing genome fasta & gtf files. @@ -240,11 +244,11 @@ the expected barcodes and assign the associated reads to the closest cell-barcod We have [publicly available](https://github.com/tallulandrews/scRNASeqPipeline) perl scripts capable of demultiplexing any scRNASeq data with a single cell-barcode with or without UMIs for plate-based protocols. These can be used as so: -```{bash, eval=TRUE} +```{bash process-raw8, eval=TRUE} perl utils/1_Flexible_UMI_Demultiplexing.pl data/10cells_read1.fq data/10cells_read2.fq "C12U8" data/10cells_barcodes.txt 2 Ex ``` -```{bash, eval=TRUE} +```{bash process-raw9, eval=TRUE} perl utils/1_Flexible_FullTranscript_Demultiplexing.pl data/10cells_read1.fq data/10cells_read2.fq "start" 12 data/10cells_barcodes.txt 2 Ex ``` @@ -267,7 +271,7 @@ barcode and try to find a "break point" between bigger libraries which are cells + some background and smaller libraries assumed to be purely background. Let's load some example simulated data which contain both large and small cells: -```{r} +```{r process-raw10} umi_per_barcode <- read.table("data/droplet_id_example_per_barcode.txt.gz") truth <- read.delim("data/droplet_id_example_truth.gz", sep=",") ``` @@ -278,7 +282,7 @@ To simplify calculations for this section exclude all barcodes with fewer than 10 total molecules. __Answer__ -```{r, include=FALSE} +```{r process-raw11, include=FALSE} dim(umi_per_barcode)[1] dim(truth)[1] umi_per_barcode <- umi_per_barcode[umi_per_barcode[,2] > 10,] @@ -287,7 +291,7 @@ umi_per_barcode <- umi_per_barcode[umi_per_barcode[,2] > 10,] One approach is to look for the inflection point where the total molecules per barcode suddenly drops: -```{r} +```{r process-raw12} barcode_rank <- rank(-umi_per_barcode[,2]) plot(barcode_rank, umi_per_barcode[,2], xlim=c(1,8000)) ``` @@ -295,7 +299,7 @@ plot(barcode_rank, umi_per_barcode[,2], xlim=c(1,8000)) Here we can see an roughly exponential curve of library sizes, so to make things simpler lets log-transform them. -```{r} +```{r process-raw13} log_lib_size <- log10(umi_per_barcode[,2]) plot(barcode_rank, log_lib_size, xlim=c(1,8000)) ``` @@ -303,7 +307,7 @@ That's better, the "knee" in the distribution is much more pronounced. We could manually estimate where the "knee" is but it much more reproducible to algorithmically identify this point. -```{r} +```{r process-raw14} # inflection point o <- order(barcode_rank) log_lib_size <- log_lib_size[o] @@ -325,7 +329,7 @@ c(TPR, Recall) Another is to fix a mixture model and find where the higher and lower distributions intersect. However, data may not fit the assumed distributions very well: -```{r} +```{r process-raw15} set.seed(-92497) # mixture model require("mixtools") @@ -344,7 +348,7 @@ Identify cells using this split point and calculate the TPR and Recall. 
__Answer__ -```{r, include=FALSE} +```{r process-raw16, include=FALSE} cells <- umi_per_barcode[umi_per_barcode[,2] > 10^split,1] TPR <- sum(cells %in% truth[,1])/length(cells) @@ -355,7 +359,7 @@ c(TPR, Recall) A third, used by CellRanger, assumes a ~10-fold range of library sizes for real cells and estimates this range using the expected number of cells. -```{r} +```{r process-raw17} n_cells <- length(truth[,1]) # CellRanger totals <- umi_per_barcode[,2] @@ -369,7 +373,7 @@ __Exercise__ Identify cells using this threshodl and calculate the TPR and Recall. __Answer__ -```{r, include=FALSE} +```{r process-raw18, include=FALSE} cells <- umi_per_barcode[umi_per_barcode[,2] > thresh,1] TPR <- sum(cells %in% truth[,1])/length(cells) @@ -389,7 +393,7 @@ cells in highly diverse samples. Below we have provided code for how this method is currently run: (We will update this page when the method is officially released) -```{r, eval=FALSE} +```{r process-raw19, eval=FALSE} require("Matrix") raw.counts <- readRDS("data/droplet_id_example.rds") diff --git a/course_files/scater.Rmd b/course_files/scater.Rmd index 60c861f60..b9e65ae4b 100644 --- a/course_files/scater.Rmd +++ b/course_files/scater.Rmd @@ -4,9 +4,9 @@ output: html_document ## Bioconductor, `SingleCellExperiment` and `scater` -```{r, echo=FALSE} +```{r scater0, echo=FALSE} library(knitr) -opts_chunk$set(fig.align = "center") +opts_chunk$set(fig.align = "center", cache=TRUE, cache.extra = list(R.version, sessionInfo())) set.seed(1) ``` @@ -22,7 +22,7 @@ We strongly recommend all new comers and even experienced high-throughput data a [`SingleCellExperiment`](http://bioconductor.org/packages/SingleCellExperiment) (SCE) is a S4 class for storing data from single-cell experiments. This includes specialized methods to store and retrieve spike-in information, dimensionality reduction coordinates and size factors for each cell, along with the usual metadata for genes and libraries. In practice, an object of this class can be created using its constructor: -```{r message=FALSE, warning=FALSE} +```{r scater1, message=FALSE, warning=FALSE} library(SingleCellExperiment) counts <- matrix(rpois(100, lambda = 10), ncol=10, nrow=10) rownames(counts) <- paste("gene", 1:10, sep = "") @@ -45,7 +45,7 @@ In the `SingleCellExperiment`, users can assign arbitrary names to entries of as Each of these suggested names has an appropriate getter/setter method for convenient manipulation of the `SingleCellExperiment`. For example, we can take the (very specifically named) `counts` slot, normalise it and assign it to `normcounts` instead: -```{r} +```{r scater2} normcounts(sce) <- log2(counts(sce) + 1) sce dim(normcounts(sce)) diff --git a/course_files/umis.Rmd b/course_files/umis.Rmd index 537068bab..70343ce93 100644 --- a/course_files/umis.Rmd +++ b/course_files/umis.Rmd @@ -2,9 +2,9 @@ output: html_document --- -```{r, echo=FALSE} +```{r umis0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} library(knitr) -opts_chunk$set(fig.align = "center", echo=FALSE) +opts_chunk$set(fig.align = "center", echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())) ``` ## Unique Molecular Identifiers (UMIs) {#umichapter} @@ -15,7 +15,7 @@ Thanks to Andreas Buness from [EMBL Monterotondo](https://www.embl.it/services/b Unique Molecular Identifiers are short (4-10bp) random barcodes added to transcripts during reverse-transcription. 
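As a toy sketch of the idea (made-up values, not course data): reads that share a gene and a UMI collapse to a single molecule:

```{r umis-toy-dedup, eval=FALSE}
# Four reads but only two distinct (gene, UMI) pairs, i.e. two molecules.
reads <- data.frame(gene = c("A", "A", "A", "A"),
                    umi  = c("ACGT", "ACGT", "ACGT", "TTAG"))
nrow(unique(reads))
```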
They enable sequencing reads to be assigned to individual transcript molecules and thus the removal of amplification noise and biases from scRNASeq data. -```{r intro-umi-protocol, out.width = '90%', fig.cap="UMI sequencing protocol"} +```{r umis1, out.width = '90%', fig.cap="UMI sequencing protocol"} knitr::include_graphics("figures/UMI-Seq-protocol.png") ``` @@ -34,7 +34,7 @@ After processing the reads from a UMI experiment, the following conventions are 2. Reads are sorted into separate files by cell barcode + For extremely large, shallow datasets, the cell barcode may be added to the read name as well to reduce the number of files. -```{r intro-umi-reads, out.width = '90%', fig.cap="UMI sequencing reads, red lightning bolts represent different fragmentation locations"} +```{r umis2, out.width = '90%', fig.cap="UMI sequencing reads, red lightning bolts represent different fragmentation locations"} knitr::include_graphics("figures/UMI-Seq-reads.png") ``` @@ -51,7 +51,7 @@ In theory, every unique UMI-transcript pair should represent all reads originati 3. __Same UMI does not necessarily mean same molecule__ + Biases in UMI frequency and short UMIs can result in the same UMI being attached to different mRNA molecules from the same gene. Thus, the number of transcripts may be underestimated. -```{r intro-umi-errors, out.width = '90%', fig.cap="Potential Errors in UMIs"} +```{r umis3, out.width = '90%', fig.cap="Potential Errors in UMIs"} knitr::include_graphics("figures/UMI-Seq-errors.png") ``` @@ -70,7 +70,7 @@ where N = total number of unique UMI barcodes and n = number of observed barcode An important caveat of this method is that it assumes that all UMIs are equally frequent. In most cases this is incorrect, since there is often a bias related to the GC content. -```{r intro-umi-amp, out.width = '60%', fig.cap="Per gene amplification rate"} +```{r umis4, out.width = '60%', fig.cap="Per gene amplification rate"} knitr::include_graphics("figures/UMI-Seq-amp.png") ``` @@ -85,7 +85,7 @@ Determining how to best process and use UMIs is currently an active area of rese Current UMI platforms (DropSeq, InDrop, ICell8) exhibit low and highly variable capture efficiency as shown in the figure below. -```{r intro-umi-capture, out.width = '70%', fig.cap="Variability in Capture Efficiency"} +```{r umis5, out.width = '70%', fig.cap="Variability in Capture Efficiency"} knitr::include_graphics("figures/UMI-Seq-capture.png") ``` @@ -93,7 +93,7 @@ This variability can introduce strong biases and it needs to be considered in do __Exercise 1__ We have provided you with UMI counts and read counts from induced pluripotent stem cells generated from three different individuals [@Tung2017-ba] (see: Chapter \@ref(exprs-qc) for details of this dataset). -```{r, echo=TRUE, include=TRUE} +```{r umis6, echo=TRUE, include=TRUE} umi_counts <- read.table("data/tung/molecules.txt", sep = "\t") read_counts <- read.table("data/tung/reads.txt", sep = "\t") ``` @@ -103,7 +103,7 @@ Using this data: 2. Determine the amplification rate: average number of reads per UMI. 
-```{r, include=FALSE}
+```{r umis7, include=FALSE}
 # Exercise 1
 # Part 1
 plot(colSums(umi_counts), colSums(umi_counts > 0), xlab="Total Molecules Detected", ylab="Total Genes Detected")

From 5247358243cf53feca761d7a0aa9dc7fa3a2d878 Mon Sep 17 00:00:00 2001
From: wikiselev
Date: Tue, 6 Aug 2019 15:54:48 +0100
Subject: [PATCH 2/4] fix misplaced echo argument

---
 course_files/exp-methods.Rmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/course_files/exp-methods.Rmd b/course_files/exp-methods.Rmd
index 37f8b8fe8..7078c4e98 100644
--- a/course_files/exp-methods.Rmd
+++ b/course_files/exp-methods.Rmd
@@ -10,7 +10,7 @@ opts_chunk$set(fig.align = "center", echo=FALSE, out.width = '70%', cache=TRUE,
 ## Experimental methods
 
 ```{r exp-methods1, fig.cap="Moore's law in single cell transcriptomics (image taken from [Svensson et al](https://arxiv.org/abs/1704.01379))", out.width = '100%'}
-knitr::include_graphics("figures/moores-law.png", cache=TRUE)
+knitr::include_graphics("figures/moores-law.png")
 ```
 
 Development of new methods and protocols for scRNA-seq is currently a very active area of research, and several protocols have been published over the last few years. A non-comprehensive list includes:

From 7aa72cbf27bae8dde2c2f119da7ba457b069d944 Mon Sep 17 00:00:00 2001
From: Cellgeni service
Date: Fri, 9 Oct 2020 11:42:38 +0100
Subject: [PATCH 3/4] All Simons Changes

---
 build-instructions.md          | 68 ++++++++++++++++++++++++++++++++++
 conf/sanger-singularity.config |  9 +++++
 2 files changed, 77 insertions(+)
 create mode 100644 build-instructions.md
 create mode 100644 conf/sanger-singularity.config

diff --git a/build-instructions.md b/build-instructions.md
new file mode 100644
index 000000000..7aa904b3a
--- /dev/null
+++ b/build-instructions.md
@@ -0,0 +1,68 @@
+## Instructions for Building Course

+### Clone Repository
+To download the course, enter the following command in the directory you want the course to be downloaded into:
+```
+git clone https://github.com/cellgeni/scrnaseq-course-private.git
+```
+
+### Installing the Image
+The course uses a docker image within a singularity environment. To ensure you have all the correct
+dependencies installed, please download the v4.07 [docker image](https://quay.io/repository/hemberg-group/scrna-seq-course?tab=tags).
+
+A specific [version of singularity](https://github.com/hpcng/singularity/tree/v3.5.3) is needed.
+
+There are also [instructions](https://github.com/hpcng/singularity/blob/v3.5.3/INSTALL.md) for installing singularity.
+
+The software nextflow, which has its own [installation instructions](https://www.nextflow.io/docs/latest/getstarted.html), is also used to build the course.
+
+### How to Build the Course
+To build the course and generate new cache files, first create a file (e.g. run-course):
+```
+vi run-course
+```
+Then copy the following code into the file:
+```
+#!/bin/bash
+
+source=cellgeni/scrnaseq-course-private
+source=/path-to-current-directory/scrnaseq-course-private/main.nf
+
+export PATH=$PATH:/path-to-installed-singularity-software/singularity-v3.5.3/bin/
+
+set -euo pipefail
+
+nextflow run $source -profile singularity -with-report reports/report.html -resume -ansi-log false
+```
+
+To run this script, use the following command:
+```
+/path-to-directory-containing-run-course-file/run-course
+```
+
+Or, if the file is in your current directory, you can use:
+```
+./run-course
+```
+
+This should build the course.
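+Note: depending on how the file was created, run-course may not be executable yet. If running it gives a
+"permission denied" error, make it executable first (a standard shell step we add here; it is not part of
+the original instructions):
+```
+chmod +x run-course
+```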
The work directory will be provided at the end and the newly built
+cache files will be located at:
+```
+/path-to-work-dir/course_work_dir/_bookdown_files/
+```
+
+### How to Upload Newly Built Cache to Amazon S3 Bucket
+If you need to upload the new cache to the Amazon S3 bucket, you will need the AWS Access Key ID and
+AWS Secret Access Key (not provided here).
+
+Then you need to start a singularity shell using the following command:
+```
+SINGULARITYENV_AWS_ACCESS_KEY_ID=NOT-PROVIDED \
+SINGULARITYENV_AWS_SECRET_ACCESS_KEY=STILL-NOT-PROVIDED \
+/path-to-installed-singularity-software/singularity-v3.5.3/bin/singularity shell -B /any-paths-that-need-to-be-mounted /path-to-docker-images/quay.io-hemberg-group-scrna-seq-course-v4.07.img
+```
+
+Once the shell has started, use the following command to upload the new cache:
+```
+aws s3 sync /path-to-work-dir/course_work_dir/_bookdown_files/ s3://scrnaseq-course/_bookdown_files/
+```
diff --git a/conf/sanger-singularity.config b/conf/sanger-singularity.config
new file mode 100644
index 000000000..e58300b96
--- /dev/null
+++ b/conf/sanger-singularity.config
@@ -0,0 +1,9 @@
+env {
+  http_proxy = 'http://wwwcache.sanger.ac.uk:3128'
+  https_proxy = 'http://wwwcache.sanger.ac.uk:3128'
+}
+
+singularity {
+  cacheDir = '/nfs/cellgeni/singularity/images_vlad/'
+  runOptions = '-B /lustre/ -B /nfs/users/nfs_c/cellgeni-su'
+}

From ff4276fa7cb4f5d12846e7a8c1bc51a2f729a545 Mon Sep 17 00:00:00 2001
From: Cellgeni service
Date: Fri, 9 Oct 2020 11:44:55 +0100
Subject: [PATCH 4/4] All Simons Changes

---
 .gitignore                               |   1 +
 README.md                                |   2 +-
 conf/base.config                         |   8 --
 conf/singularity.config                  |   6 --
 course_files/Intro-TabulaMuris.Rmd       |  64 ++++++------
 course_files/Intro-to-Bioconductor.Rmd   |  25 ++---
 course_files/_bookdown.yml               |  38 +++----
 course_files/advanced.Rmd                |   7 +-
 course_files/clust-intro.Rmd             |  14 +--
 course_files/clustering.Rmd              |  50 +++++-----
 course_files/confounders-reads.Rmd       |  12 +--
 course_files/confounders.Rmd             |  14 +--
 course_files/de-intro.Rmd                |  10 +-
 course_files/de-real.Rmd                 |  38 +++----
 course_files/dropouts.Rmd                |  54 +++++-----
 course_files/exp-methods.Rmd             |  16 +--
 course_files/exprs-constr.Rmd            |  10 +-
 course_files/exprs-norm-reads.Rmd        |  34 +++----
 course_files/exprs-norm.Rmd              | 120 +++++++++++------------
 course_files/exprs-overview-reads.Rmd    |  22 ++---
 course_files/exprs-overview.Rmd          |  28 +++---
 course_files/exprs-qc-reads.Rmd          |  62 ++++++------
 course_files/exprs-qc.Rmd                |  58 +++++------
 course_files/ggplot2-pheatmap-PCA.Rmd    |  19 ++--
 course_files/ideal-scrnaseq-pipeline.Rmd |   6 +-
 course_files/imputation.Rmd              |  24 ++---
 course_files/intro-to-R.Rmd              |  60 ++++++------
 course_files/intro.Rmd                   |   8 +-
 course_files/process-raw-QC.Rmd          |  12 +--
 course_files/process-raw-align.Rmd       |  10 +-
 course_files/process-raw.Rmd             |  40 ++++----
 course_files/projection.Rmd              | 110 ++++++++++-----------
 course_files/pseudotime.Rmd              |  68 ++++++-------
 course_files/remove-conf-reads.Rmd       |  32 +++----
 course_files/remove-conf.Rmd             |  32 +++----
 course_files/scater.Rmd                  |   2 +-
 course_files/search.Rmd                  |  49 ++++-----
 course_files/seurat.Rmd                  |  80 +++++++--------
 course_files/umis.Rmd                    |  18 ++--
 main.nf                                  |  61 ++++++++----
 nextflow.config                          |   1 +
 41 files changed, 658 insertions(+), 667 deletions(-)

diff --git a/.gitignore b/.gitignore
index eb8382601..8b78664f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@ nextflow_output
 .DS_*
 */.DS_*
 .nextflow*
+/course_files/website

diff --git a/README.md b/README.md
index cacb09b0b..b32ae1988 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ The
number of computational tools is increasing rapidly and we are doing our bes ## Web page -__[https://scrnaseq-course.cog.sanger.ac.uk/website/index.html](https://scrnaseq-course.cog.sanger.ac.uk/website/index.html)__ +__[https://scrnaseq-course.cog.sanger.ac.uk/browser.html?shared=data/](https://scrnaseq-course.cog.sanger.ac.uk/browser.html?shared=data/)__ ## Video diff --git a/conf/base.config b/conf/base.config index 8d0c05761..b71121ff7 100644 --- a/conf/base.config +++ b/conf/base.config @@ -5,11 +5,3 @@ process { cpus = 2 memory = 16.GB } - -// sanger S3 configuration -aws { - client { - endpoint = "https://cog.sanger.ac.uk" - signerOverride = "S3SignerType" - } -} diff --git a/conf/singularity.config b/conf/singularity.config index d09cd7f94..74cbed2e8 100644 --- a/conf/singularity.config +++ b/conf/singularity.config @@ -1,13 +1,7 @@ docker.enabled = false -env { - http_proxy = 'http://wwwcache.sanger.ac.uk:3128' - https_proxy = 'http://wwwcache.sanger.ac.uk:3128' -} - singularity { enabled = true autoMounts = true - cacheDir = '/nfs/cellgeni/singularity/images_vlad/' } diff --git a/course_files/Intro-TabulaMuris.Rmd b/course_files/Intro-TabulaMuris.Rmd index 5578938c5..0193b137b 100644 --- a/course_files/Intro-TabulaMuris.Rmd +++ b/course_files/Intro-TabulaMuris.Rmd @@ -2,7 +2,7 @@ output: html_document --- -```{r Intro-TabulaMuris0, echo=FALSE} +```{r intro-tab0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} library(knitr) opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo())) ``` @@ -22,9 +22,11 @@ Unlike most single-cell RNASeq data Tabula Muris has release their data through Terminal-based download of FACS data: -```{bash Intro-TabulaMuris1, message=FALSE, warning=FALSE, results='hide'} +```{bash intro-tab1, message=FALSE, warning=FALSE, results='hide'} +echo $http_proxy +echo $https_proxy wget https://ndownloader.figshare.com/files/10038307 -unzip 10038307 +unzip -o 10038307 wget https://ndownloader.figshare.com/files/10038310 mv 10038310 FACS_metadata.csv wget https://ndownloader.figshare.com/files/10039267 @@ -32,9 +34,9 @@ mv 10039267 FACS_annotations.csv ``` Terminal-based download of 10X data: -```{bash Intro-TabulaMuris2, message=FALSE, warning=FALSE, results='hide'} +```{bash intro-tab2, message=FALSE, warning=FALSE, results='hide'} wget https://ndownloader.figshare.com/files/10038325 -unzip 10038325 +unzip -o 10038325 wget https://ndownloader.figshare.com/files/10038328 mv 10038328 droplet_metadata.csv wget https://ndownloader.figshare.com/files/10039264 @@ -44,11 +46,11 @@ mv 10039264 droplet_annotation.csv Note if you download the data by hand you should unzip & rename the files as above before continuing. You should now have two folders : "FACS" and "droplet" and one annotation and metadata file for each. To inspect these files you can use the `head` to see the top few lines of the text files (Press "q" to exit): -```{bash Intro-TabulaMuris3} +```{bash intro-tab3} head -n 10 droplet_metadata.csv ``` You can also check the number of rows in each file using: -```{bash Intro-TabulaMuris4} +```{bash intro-tab4} wc -l droplet_annotation.csv ``` @@ -63,13 +65,13 @@ Droplet : 42,193 cells We can now read in the relevant count matrix from the comma-separated file. 
Then inspect the resulting dataframe:
-```{r Intro-TabulaMuris5}
+```{r intro-tab5}
 dat = read.delim("FACS/Kidney-counts.csv", sep=",", header=TRUE)
 dat[1:5,1:5]
 ```
 
 We can see that the first column in the dataframe is the gene names, so first we move these to the rownames so we have a numeric matrix:
-```{r Intro-TabulaMuris6}
+```{r intro-tab6}
 dim(dat)
 rownames(dat) <- dat[,1]
 dat <- dat[,-1]
@@ -77,13 +79,13 @@ dat <- dat[,-1]
 
 Since this is a Smartseq2 dataset it may contain spike-ins, so let's check:
 
-```{r Intro-TabulaMuris7}
+```{r intro-tab7}
 rownames(dat)[grep("^ERCC-", rownames(dat))]
 ```
 
 Now we can extract much of the metadata for this data from the column names:
 
-```{r Intro-TabulaMuris8}
+```{r intro-tab8}
 cellIDs <- colnames(dat)
 cell_info <- strsplit(cellIDs, "\\.")
 Well <- lapply(cell_info, function(x){x[1]})
@@ -93,19 +95,19 @@ Mouse <- unlist(lapply(cell_info, function(x){x[3]}))
 ```
 
 We can check the distributions of each of these metadata classifications:
-```{r Intro-TabulaMuris9}
+```{r intro-tab9}
 summary(factor(Mouse))
 ```
 
 We can also check if any technical factors are confounded:
-```{r Intro-TabulaMuris10}
+```{r intro-tab10}
 table(Mouse, Plate)
 ```
 
 Lastly we will read the computationally inferred cell-type annotations and match them to the cells in our expression matrix:
-```{r Intro-TabulaMuris11}
+```{r intro-tab11}
 ann <- read.table("FACS_annotations.csv", sep=",", header=TRUE)
 ann <- ann[match(cellIDs, ann[,1]),]
 celltype <- ann[,3]
@@ -114,7 +116,7 @@ celltype <- ann[,3]
 ## Building a scater object
 To create a SingleCellExperiment object we must put together all the cell annotations into a single dataframe. Since the experimental batch (PCR plate) is completely confounded with donor mouse, we will only keep one of them.
-```{r Intro-TabulaMuris12, message=FALSE, warning=FALSE}
+```{r intro-tab12, message=FALSE, warning=FALSE}
 library("SingleCellExperiment")
 library("scater")
 cell_anns <- data.frame(mouse = Mouse, well=Well, type=celltype)
@@ -123,7 +125,7 @@ sceset <- SingleCellExperiment(assays = list(counts = as.matrix(dat)), colData=c
 ```
 
 Finally, if the dataset contains spike-ins we add a hidden variable to the SingleCellExperiment object to track them:
-```{r Intro-TabulaMuris13}
+```{r intro-tab13}
 isSpike(sceset, "ERCC") <- grepl("ERCC-", rownames(sceset))
 ```
 
@@ -139,49 +141,49 @@ respectively.
 
 We will be using the "Matrix" package to store matrices in sparse-matrix format in R.
 
-```{r Intro-TabulaMuris14}
+```{r intro-tab14}
 library("Matrix")
 cellbarcodes <- read.table("droplet/Kidney-10X_P4_5/barcodes.tsv")
 genenames <- read.table("droplet/Kidney-10X_P4_5/genes.tsv")
 molecules <- readMM("droplet/Kidney-10X_P4_5/matrix.mtx")
 ```
 Now we will add the appropriate row and column names. However, if you inspect the cellbarcodes you have just read in, you will see that they are just the barcode sequences associated with each cell. This is a problem, since each batch of 10X data uses the same pool of barcodes, so if we need to combine data from multiple 10X batches the cellbarcodes will not be unique.
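To see why this matters, here is a small, purely hypothetical illustration (the barcode sequences below are made up and are not from the Tabula Muris data): the same barcode sequence can easily occur in two different 10X batches, so naively combining the batches would merge unrelated cells.

```r
# Hypothetical example of a barcode collision between two 10X batches
batch1_barcodes <- c("AAACCTGAGAAACCAT", "AAACCTGAGAAACCGC")
batch2_barcodes <- c("AAACCTGAGAAACCAT", "AAACCTGAGAAACGCC")
# any shared barcode would be ambiguous after combining the batches
intersect(batch1_barcodes, batch2_barcodes)
```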
Hence we will attach the batch ID to each cell barcode: -```{r Intro-TabulaMuris15} +```{r intro-tab15} head(cellbarcodes) ``` -```{r Intro-TabulaMuris16} +```{r intro-tab16} rownames(molecules) <- genenames[,1] colnames(molecules) <- paste("10X_P4_5", cellbarcodes[,1], sep="_") ``` Now lets get the metadata and computational annotations for this data: -```{r Intro-TabulaMuris17} +```{r intro-tab17} meta <- read.delim("droplet_metadata.csv", sep=",", header = TRUE) head(meta) ``` Here we can see that we need to use "10X_P4_5" to find the metadata for this batch, also note that the format of the mouse ID is different in this metadata table with hyphens instead of underscores and with the gender in the middle of the ID. From checking the methods section of the accompanying paper we know that the same 8 mice were used for both droplet and plate-based techniques. So we need to fix the mouse IDs to be consistent with those used in the FACS experiments. -```{r Intro-TabulaMuris18} +```{r intro-tab18} meta[meta$channel == "10X_P4_5",] mouseID <- "3_8_M" ``` Note: depending on the tissue you have been assigned you may have 10X data from mixed samples : e.g. mouse id = 3-M-5/6. You should still reformat these to be consistent but they will not match mouse ids from the FACS data which may affect your downstream analysis. If the mice weren't from an inbred strain it would be possible to assign individual cells to a specific mouse using exonic-SNPs but that is beyond the scope of this course. -```{r Intro-TabulaMuris19} +```{r intro-tab19} ann <- read.delim("droplet_annotation.csv", sep=",", header=TRUE) head(ann) ``` Again you will find a slight formating difference between the cellID in the annotation and the cellbarcodes which we will have to correct before matching them. -```{r Intro-TabulaMuris20} +```{r intro-tab20} ann[,1] <- paste(ann[,1], "-1", sep="") ann_subset <- ann[match(colnames(molecules), ann[,1]),] celltype <- ann_subset[,3] ``` Now lets build the cell-metadata dataframe: -```{r Intro-TabulaMuris21} +```{r intro-tab21} cell_anns <- data.frame(mouse = rep(mouseID, times=ncol(molecules)), type=celltype) rownames(cell_anns) <- colnames(molecules); ``` @@ -189,7 +191,7 @@ rownames(cell_anns) <- colnames(molecules); __Exercise__ Repeat the above for the other 10X batches for your tissue. __Answer__ -```{r Intro-TabulaMuris22, echo=FALSE, eval=TRUE} +```{r intro-tab22, echo=FALSE, eval=TRUE} molecules1 <- molecules cell_anns1 <- cell_anns @@ -226,13 +228,13 @@ cell_anns3 <- cell_anns Now that we have read the 10X data in multiple batches we need to combine them into a single SingleCellExperiment object. 
First we will check that the gene names are the same and in the same order across all batches: -```{r Intro-TabulaMuris23} +```{r intro-tab23} identical(rownames(molecules1), rownames(molecules2)) identical(rownames(molecules1), rownames(molecules3)) ``` Now we'll check that there aren't any repeated cellIDs: -```{r Intro-TabulaMuris24} +```{r intro-tab24} sum(colnames(molecules1) %in% colnames(molecules2)) sum(colnames(molecules1) %in% colnames(molecules3)) sum(colnames(molecules2) %in% colnames(molecules3)) @@ -240,7 +242,7 @@ sum(colnames(molecules2) %in% colnames(molecules3)) Everything is ok, so we can go ahead and combine them: -```{r Intro-TabulaMuris25} +```{r intro-tab25} all_molecules <- cbind(molecules1, molecules2, molecules3) all_cell_anns <- as.data.frame(rbind(cell_anns1, cell_anns2, cell_anns3)) all_cell_anns$batch <- rep(c("10X_P4_5", "10X_P4_6","10X_P7_5"), times = c(nrow(cell_anns1), nrow(cell_anns2), nrow(cell_anns3))) @@ -250,13 +252,13 @@ __Exercise__ How many cells are in the whole dataset? __Answer__ -```{r Intro-TabulaMuris26, echo=FALSE, eval=FALSE} +```{r intro-tab26, echo=FALSE, eval=FALSE} dim(all_molecules)[2] ``` Now build the SingleCellExperiment object. One of the advantages of the SingleCellExperiment class is that it is capable of storing data in normal matrix or sparse matrix format, as well as HDF5 format which allows large non-sparse matrices to be stored & accessed on disk in an efficient manner rather than loading the whole thing into RAM. -```{r Intro-TabulaMuris27} +```{r intro-tab27} all_molecules <- as.matrix(all_molecules) sceset <- SingleCellExperiment( assays = list(counts = as.matrix(all_molecules)), @@ -265,7 +267,7 @@ sceset <- SingleCellExperiment( ``` Since this is 10X data it will not contain spike-ins, so we just save the data: -```{r Intro-TabulaMuris28} +```{r intro-tab28} saveRDS(sceset, "kidney_droplet.rds") ``` diff --git a/course_files/Intro-to-Bioconductor.Rmd b/course_files/Intro-to-Bioconductor.Rmd index a0044f393..a86015e83 100644 --- a/course_files/Intro-to-Bioconductor.Rmd +++ b/course_files/Intro-to-Bioconductor.Rmd @@ -2,11 +2,6 @@ output: html_document --- -```{r intro-to-Bioconductor0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} -library(knitr) -opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo())) -``` - ## Data Types ### What is Tidy Data? @@ -18,12 +13,12 @@ Tidy data is a concept largely defined by Hadley Wickham [@wickham_2014]. Tidy d 3. Each value has its own cell. Here is an example of some tidy data: -```{R intro-to-Bioconductor1,eval=TRUE, echo=FALSE} +```{R Intro-bio0, eval=TRUE, echo=FALSE} data.frame(Students=c("Mark", "Jane", "Mohammed", "Tom", "Celia"), Subject=c("Maths", "Biology", "Physics", "Maths", "Computing"), Years=c(1,2,3,2,3), Score=c(5,6,4,7,9)) ``` Here is an example of some untidy data: -```{R intro-to-Bioconductor2, eval=TRUE, echo=FALSE} +```{R Intro-bio1, eval=TRUE, echo=FALSE} data.frame(Students=c("Matt", "Matt", "Ellie", "Ellie", "Tim", "Tim", "Louise", "Louise", "Kelly", "Kelly"), Sport=c("Tennis","Tennis", "Rugby", "Rugby","Football", "Football","Swimming","Swimming", "Running", "Running"), Category=c("Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses"), Counts=c(0,1,3,2,1,4,2,2,5,1)) ``` @@ -33,7 +28,7 @@ Tidy data is generally easier to work with than untidy data, especially if you a The untidy data above is untidy because two variables (`Wins` and `Losses`) are stored in one column (`Category`). 
This is a common way in which data can be untidy. To tidy this data, we need to make `Wins` and `Losses` into columns, and store the values in `Counts` in these columns. Fortunately, there is a function from the tidyverse packages to perform this operation. The function is called `spread`, and it takes two arguments, `key` and `value`. You should pass the name of the column which contains multiple variables to `key`, and pass the name of the column which contains values from multiple variables to `value`. For example: -```{R intro-to-Bioconductor3, echo=TRUE, eval=TRUE, message=FALSE} +```{R Intro-bio2, echo=TRUE, eval=TRUE, message=FALSE} library(tidyverse) sports<-data.frame(Students=c("Matt", "Matt", "Ellie", "Ellie", "Tim", "Tim", "Louise", "Louise", "Kelly", "Kelly"), Sport=c("Tennis","Tennis", "Rugby", "Rugby","Football", "Football","Swimming","Swimming", "Running", "Running"), Category=c("Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses", "Wins", "Losses"), Counts=c(0,1,3,2,1,4,2,2,5,1)) sports @@ -43,19 +38,19 @@ spread(sports, key=Category, value=Counts) Task 2: The dataframe `foods` defined below is untidy. Work out why and use `spread()` to tidy it -```{R intro-to-Bioconductor4, echo=TRUE, eval=TRUE} +```{R Intro-bio3, echo=TRUE, eval=TRUE} foods<-data.frame(student=c("Antoinette","Antoinette","Taylor", "Taylor", "Alexa", "Alexa"), Category=c("Dinner", "Dessert", "Dinner", "Dessert", "Dinner","Dessert"), Frequency=c(3,1,4,5,2,1)) ``` The other common way in which data can be untidy is if the columns are values instead of variables. For example, the dataframe below shows the percentages some students got in tests they did in May and June. The data is untidy because the columns `May` and `June` are values, not variables. -```{R intro-to-Bioconductor5} +```{R Intro-bio4} percentages<-data.frame(student=c("Alejandro", "Pietro", "Jane"), "May"=c(90,12,45), "June"=c(80,30,100)) ``` Fortunately, there is a function in the tidyverse packages to deal with this problem too. `gather()` takes the names of the columns which are values, the `key` and the `value` as arguments. This time, the `key` is the name of the variable with values as column names, and the `value` is the name of the variable with values spread over multiple columns. Ie: -```{R intro-to-Bioconductor6} +```{R Intro-bio5} gather(percentages, "May", "June", key="Month", value = "Percentage") ``` @@ -65,10 +60,10 @@ These examples don't have much to do with single-cell RNA-seq analysis, but are If you google 'rich data', you will find lots of different definitions for this term. In this course, we will use 'rich data' to mean data which is generated by combining information from multiple sources. For example, you could make rich data by creating an object in R which contains a matrix of gene expression values across the cells in your single-cell RNA-seq experiment, but also information about how the experiment was performed. Objects of the `SingleCellExperiment` class, which we will discuss below, are an example of rich data. -```{r intro-to-Bioconductor7, echo=FALSE, message=FALSE} +```{r Intro-bio6, echo=FALSE, message=FALSE} library(knitr) library(scater) -opts_chunk$set(fig.align = "center") +opts_chunk$set(cache = TRUE, fig.align = "center") set.seed(1) ``` @@ -84,7 +79,7 @@ We strongly recommend all new comers and even experienced high-throughput data a [`SingleCellExperiment`](http://bioconductor.org/packages/SingleCellExperiment) (SCE) is a S4 class for storing data from single-cell experiments. 
This includes specialized methods to store and retrieve spike-in information, dimensionality reduction coordinates and size factors for each cell, along with the usual metadata for genes and libraries. In practice, an object of this class can be created using its constructor: -```{r intro-to-Bioconductor8, message=FALSE, warning=FALSE} +```{r Intro-bio7, message=FALSE, warning=FALSE} library(SingleCellExperiment) counts <- matrix(rpois(100, lambda = 10), ncol=10, nrow=10) rownames(counts) <- paste("gene", 1:10, sep = "") @@ -107,7 +102,7 @@ In the `SingleCellExperiment`, users can assign arbitrary names to entries of as Each of these suggested names has an appropriate getter/setter method for convenient manipulation of the `SingleCellExperiment`. For example, we can take the (very specifically named) `counts` slot, normalise it and assign it to `normcounts` instead: -```{r intro-to-Bioconductor9} +```{r Intro-bio8} normcounts(sce) <- log2(counts(sce) + 1) sce dim(normcounts(sce)) diff --git a/course_files/_bookdown.yml b/course_files/_bookdown.yml index 1faa87e4b..dd274ca84 100644 --- a/course_files/_bookdown.yml +++ b/course_files/_bookdown.yml @@ -1,28 +1,20 @@ book_filename: "scRNA-seq-course" chapter_name: "" output_dir: website +always_allow_html: yes new_session: yes -rmd_files: ["index.Rmd", -"intro.Rmd", -"exp-methods.Rmd", -"process-raw-QC.Rmd", "process-raw.Rmd", "process-raw-align.Rmd", -"exprs-constr.Rmd", "umis.Rmd", -"intro-to-R.Rmd", "Intro-to-Bioconductor.Rmd", "scater.Rmd", "ggplot2-pheatmap-PCA.Rmd", -"Intro-TabulaMuris.Rmd", +rmd_files: [ +"index.Rmd", "intro.Rmd", "exp-methods.Rmd", +"process-raw-QC.Rmd", "process-raw.Rmd", +"process-raw-align.Rmd", "exprs-constr.Rmd", +"umis.Rmd", "intro-to-R.Rmd", "Intro-to-Bioconductor.Rmd", +"scater.Rmd", "ggplot2-pheatmap-PCA.Rmd", "Intro-TabulaMuris.Rmd", "exprs-qc.Rmd", "exprs-qc-reads.Rmd", "exprs-overview.Rmd", -"exprs-overview-reads.Rmd", -"confounders.Rmd", "confounders-reads.Rmd", -"exprs-norm.Rmd", "exprs-norm-reads.Rmd", -"remove-conf.Rmd", "remove-conf-reads.Rmd", -"clust-intro.Rmd", "clustering.Rmd", -"dropouts.Rmd", -"pseudotime.Rmd", -"imputation.Rmd", -"de-intro.Rmd", "de-real.Rmd", -"projection.Rmd", -"search.Rmd", -"seurat.Rmd", -"ideal-scrnaseq-pipeline.Rmd", -"advanced.Rmd", -"tools.Rmd", -"references.Rmd"] +"exprs-overview-reads.Rmd", "confounders.Rmd", +"confounders-reads.Rmd", "exprs-norm.Rmd", "exprs-norm-reads.Rmd", +"remove-conf.Rmd", "remove-conf-reads.Rmd", "clust-intro.Rmd", +"clustering.Rmd", "dropouts.Rmd", "pseudotime.Rmd", "imputation.Rmd", +"de-intro.Rmd", "de-real.Rmd", "projection.Rmd", +"search.Rmd", "seurat.Rmd", "ideal-scrnaseq-pipeline.Rmd", +"advanced.Rmd", "tools.Rmd", "references.Rmd" +] diff --git a/course_files/advanced.Rmd b/course_files/advanced.Rmd index 0ef98ce67..828cacd48 100644 --- a/course_files/advanced.Rmd +++ b/course_files/advanced.Rmd @@ -2,6 +2,11 @@ output: html_document --- +```{r advanced0, include=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} +library(knitr) +opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo())) +``` + # Advanced exercises For the final part of the course we would like you to work on more open ended problems. The goal is to carry out the type of analyses that you would be doing for an actual research project. @@ -27,7 +32,7 @@ The `MultiAssayExperiment` also contains the phenotypic data (in the `colData` s Here we will show you how to create an `SCE` from a `MultiAssayExperiment` object. 
For example, if you download the `Shalek2013` dataset you will be able to create an `SCE` using the following code:
-```{r, message=FALSE, warning=FALSE, eval=FALSE}
+```{r advanced1, message=FALSE, warning=FALSE, eval=FALSE}
 library(MultiAssayExperiment)
 library(SummarizedExperiment)
 library(scater)
diff --git a/course_files/clust-intro.Rmd b/course_files/clust-intro.Rmd
index 03f459032..bf1b8881e 100644
--- a/course_files/clust-intro.Rmd
+++ b/course_files/clust-intro.Rmd
@@ -6,9 +6,9 @@ output: html_document
 
 ## Clustering Introduction
 
-```{r, echo=FALSE}
+```{r clust-intro0, echo=FALSE}
 library(knitr)
-opts_chunk$set(fig.align = "center", echo=FALSE)
+opts_chunk$set(cache = TRUE, fig.align = "center", echo=FALSE)
 ```
 
 Once we have normalized the data and removed confounders we can carry out analyses that are relevant to the biological questions at hand. The exact nature of the analysis depends on the dataset. Nevertheless, there are a few aspects that are useful in a wide range of contexts and we will be discussing some of them in the next few chapters. We will start with the clustering of scRNA-seq data.
@@ -46,11 +46,11 @@ top-down approach. In the former case, each cell is initially assigned to
 its own cluster and pairs of clusters are subsequently merged to create a
 hierarchy:
 
-```{r clust-hierarch-raw, out.width = '30%', fig.cap="Raw data"}
+```{r clust-intro1, out.width = '30%', fig.cap="Raw data"}
 knitr::include_graphics("figures/hierarchical_clustering1.png")
 ```
 
-```{r clust-hierarch-dendr, out.width = '50%', fig.cap="The hierarchical clustering dendrogram"}
+```{r clust-intro2, out.width = '50%', fig.cap="The hierarchical clustering dendrogram"}
 knitr::include_graphics("figures/hierarchical_clustering2.png")
 ```
 
@@ -65,7 +65,7 @@ In [_k_-means clustering](https://en.wikipedia.org/wiki/K-means_clustering), the
 different clusters. In an iterative manner, cluster centers are assigned and each cell is assigned to its nearest cluster:
 
-```{r clust-k-means, out.width = '100%', fig.cap="Schematic representation of the k-means clustering"}
+```{r clust-intro3, out.width = '100%', fig.cap="Schematic representation of the k-means clustering"}
 knitr::include_graphics("figures/k-means.png")
 ```
 
@@ -77,7 +77,7 @@ Over the last two decades there has been a lot of interest in analyzing
 networks in various domains. One goal is to identify groups or
 modules of nodes in a network.
 
-```{r clust-graph, out.width = '100%', fig.cap="Schematic representation of the graph network"}
+```{r clust-intro4, out.width = '100%', fig.cap="Schematic representation of the graph network"}
 knitr::include_graphics("figures/graph_network.jpg")
 ```
 
@@ -101,7 +101,7 @@ to scRNA-seq data by building a graph where each node represents a cell.
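To illustrate the idea behind these graph-based methods, below is a small self-contained sketch: we build a k-nearest-neighbour graph from simulated 2D coordinates (standing in for, say, PCA coordinates of cells) and run Louvain community detection with the `igraph` package. This is our own toy illustration, not code from the course; the simulated data and the choice of `k = 5` are arbitrary.

```r
library(igraph)

set.seed(1)
# simulate 60 "cells" from two groups in a 2D embedding
coords <- rbind(matrix(rnorm(60, mean = 0), ncol = 2),
                matrix(rnorm(60, mean = 3), ncol = 2))
d <- as.matrix(dist(coords))
k <- 5
# connect each cell to its k nearest neighbours
# (position 1 of order() is the cell itself, so we skip it)
edges <- do.call(rbind, lapply(seq_len(nrow(d)), function(i) {
    cbind(i, order(d[i, ])[2:(k + 1)])
}))
g <- simplify(graph_from_data_frame(as.data.frame(edges), directed = FALSE))
# Louvain community detection should recover the two groups
table(membership(cluster_louvain(g)))
```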
Note th
 #### [SC3](http://bioconductor.org/packages/SC3/)
 
-```{r clust-sc3, out.width = '100%', fig.cap="SC3 pipeline"}
+```{r clust-intro5, out.width = '100%', fig.cap="SC3 pipeline"}
 knitr::include_graphics("figures/sc3.png")
 ```
 
diff --git a/course_files/clustering.Rmd b/course_files/clustering.Rmd
index 8f6d05339..e753ea4f7 100644
--- a/course_files/clustering.Rmd
+++ b/course_files/clustering.Rmd
@@ -4,12 +4,12 @@ output: html_document
 
 ## Clustering example {#clust-methods}
 
-```{r, echo=FALSE}
+```{r clustering0, echo=FALSE}
 library(knitr)
-opts_chunk$set(fig.align = "center")
+opts_chunk$set(cache = TRUE, fig.align = "center")
 ```
 
-```{r, echo=TRUE, message=FALSE, warning=FALSE}
+```{r clustering1, echo=TRUE, message=FALSE, warning=FALSE}
 library(pcaMethods)
 library(SC3)
 library(scater)
@@ -24,18 +24,18 @@ To illustrate clustering of scRNA-seq data, we consider the `Deng` dataset of ce
 
 ### Deng dataset
 
 Let's load the data and look at it:
-```{r}
+```{r clustering2}
 deng <- readRDS("data/deng/deng-reads.rds")
 deng
 ```
 
 Let's look at the cell type annotation:
-```{r}
+```{r clustering3}
 table(colData(deng)$cell_type2)
 ```
 
 A simple PCA analysis already separates some strong cell types and provides some insights into the data structure:
-```{r}
+```{r clustering4}
 plotPCA(deng, colour_by = "cell_type2")
 ```
 As you can see, the early cell types separate quite well, but the three blastocyst timepoints are more difficult to distinguish.
@@ -45,55 +45,55 @@ As you can see, the early cell types separate quite well, but the three blastocy
 
 ### SC3
 
 Let's run `SC3` clustering on the Deng data. The advantage of `SC3` is that it can directly ingest a `SingleCellExperiment` object.
 
 Now let's imagine we do not know the number of clusters _k_ (cell types). `SC3` can estimate a number of clusters for you:
-```{r}
+```{r clustering5}
 deng <- sc3_estimate_k(deng)
 metadata(deng)$sc3$k_estimation
 ```
 
 Interestingly, the number of cell types predicted by `SC3` is smaller than in the original data annotation. However, if we merge the early, mid and late stages of the different cell types, we will have exactly 6 cell types. We store the merged cell types in the `cell_type1` column of the `colData` slot:
-```{r}
+```{r clustering6}
 plotPCA(deng, colour_by = "cell_type1")
 ```
 
 Now we are ready to run `SC3` (we also ask it to calculate biological properties of the clusters):
-```{r}
+```{r clustering7}
 deng <- sc3(deng, ks = 10, biology = TRUE, n_cores = 1)
 ```
 
 The `SC3` result consists of several different outputs (please look in [@Kiselev2016-bq] and [SC3 vignette](http://bioconductor.org/packages/release/bioc/vignettes/SC3/inst/doc/my-vignette.html) for more details).
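As an aside, the cluster labels computed by `sc3()` are also written into the `colData` of the object — for `ks = 10` the relevant column is `sc3_10_clusters`, which is also used for plotting further down — so you can tabulate them directly. A minimal check (our own addition, not part of the course code):

```r
# cluster assignments added by sc3() above; count the cells in each cluster
table(colData(deng)$sc3_10_clusters)
```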
Here we show some of them: Consensus matrix: -```{r, fig.height=6} +```{r clustering8, fig.height=6} sc3_plot_consensus(deng, k = 10, show_pdata = "cell_type2") ``` Silhouette plot: -```{r, fig.height=9} +```{r clustering9, fig.height=9} sc3_plot_silhouette(deng, k = 10) ``` Heatmap of the expression matrix: -```{r, fig.height=6} +```{r clustering10, fig.height=6} sc3_plot_expression(deng, k = 10, show_pdata = "cell_type2") ``` Identified marker genes: -```{r, fig.height=11} +```{r clustering11, fig.height=11} sc3_plot_markers(deng, k = 10, show_pdata = "cell_type2") ``` PCA plot with highlighted `SC3` clusters: -```{r} +```{r clustering12} plotPCA(deng, colour_by = "sc3_10_clusters") ``` Compare the results of `SC3` clustering with the original publication cell type labels: -```{r} +```{r clustering13} adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$sc3_10_clusters) ``` __Note__ `SC3` can also be run in an interactive `Shiny` session: -```{r, eval=FALSE} +```{r clustering14, eval=FALSE} sc3_interactive(deng) ``` @@ -112,7 +112,7 @@ __Note__ Due to direct calculation of distances `SC3` becomes very slow when the ### tSNE + kmeans [tSNE](https://lvdmaaten.github.io/tsne/) plots that we saw before (\@ref(visual-tsne)) when used the __scater__ package are made by using the [Rtsne](https://cran.r-project.org/web/packages/Rtsne/index.html) and [ggplot2](https://cran.r-project.org/web/packages/ggplot2/index.html) packages. Here we will do the same: -```{r clust-tsne, fig.cap = "tSNE map of the patient data"} +```{r clustering15, fig.cap = "tSNE map of the patient data"} deng <- runTSNE(deng, rand_seed = 1) plotTSNE(deng) ``` @@ -122,7 +122,7 @@ Note that all points on the plot above are black. This is different from what we Now we are going to apply _k_-means clustering algorithm to the cloud of points on the tSNE map. How many groups do you see in the cloud? We will start with $k=8$: -```{r clust-tsne-kmeans2, fig.cap = "tSNE map of the patient data with 8 colored clusters, identified by the k-means clustering algorithm"} +```{r clustering16, fig.cap = "tSNE map of the patient data with 8 colored clusters, identified by the k-means clustering algorithm"} colData(deng)$tSNE_kmeans <- as.character(kmeans(deng@reducedDims$TSNE, centers = 8)$clust) plotTSNE(deng, colour_by = "tSNE_kmeans") ``` @@ -132,7 +132,7 @@ __Exercise 7__: Make the same plot for $k=10$. __Exercise 8__: Compare the results between `tSNE+kmeans` and the original publication cell types. Can the results be improved by changing the `perplexity` parameter? __Our solution__: -```{r, echo=FALSE} +```{r clustering17, echo=FALSE} colData(deng)$tSNE_kmeans <- as.character(kmeans(deng@reducedDims$TSNE, centers = 10)$clust) adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$tSNE_kmeans) ``` @@ -145,12 +145,12 @@ overview of the solutions, we need to run the methods multiple times. `SC3` is a As mentioned in the previous chapter [SINCERA](https://research.cchmc.org/pbge/sincera.html) is based on hierarchical clustering. 
One important thing to keep in mind is that it performs a gene-level z-score transformation before doing clustering:

-```{r}
+```{r clustering18}
 # use the same gene filter as in SC3
 input <- logcounts(deng[rowData(deng)$sc3_gene_filter, ])
 ```
 
-```{r, echo=TRUE, fig.height=7, fig.width=7}
+```{r clustering19, echo=TRUE, fig.height=7, fig.width=7}
 # perform gene-by-gene per-sample z-score transformation
 dat <- apply(input, 1, function(y) scRNA.seq.funcs::z.transform.helper(y))
 # hierarchical clustering
 dd <- as.dist((1 - cor(t(dat), method = "spearman"))/2)
 hc <- hclust(dd, method = "average")
 ```
 
 If the number of clusters is not known, [SINCERA](https://research.cchmc.org/pbge/sincera.html) can identify __k__ as the minimum height of the hierarchical tree that generates no more than a specified number of singleton clusters (clusters containing only 1 cell):
-```{r, echo=TRUE}
+```{r clustering20, echo=TRUE}
 num.singleton <- 0
 kk <- 1
 for (i in 2:dim(dat)[2]) {
@@ -176,7 +176,7 @@ cat(kk)
 ```
 
 Let's now visualize the SINCERA results as a heatmap:
-```{r clust-sincera, fig.cap = "Clustering solutions of SINCERA method using found $k$"}
+```{r clustering21, fig.cap = "Clustering solutions of SINCERA method using found $k$"}
 pheatmap(
     t(dat),
     cluster_cols = hc,
@@ -189,7 +189,7 @@ pheatmap(
 
 __Exercise 10__: Compare the results between `SINCERA` and the original publication cell types.
 
 __Our solution__:
-```{r, echo=FALSE}
+```{r clustering22, echo=FALSE}
 colData(deng)$SINCERA <- as.character(cutree(hc, k = kk))
 adjustedRandIndex(colData(deng)$cell_type2, colData(deng)$SINCERA)
 ```
 
__Exercise 11__: Is using the singleton cluster criteria for finding __k__ a goo
 ### sessionInfo()
 
-```{r echo=FALSE}
+```{r clustering23, echo=FALSE}
 sessionInfo()
 ```
diff --git a/course_files/confounders.Rmd
b/course_files/confounders.Rmd index c8ba38a23..0250e00d6 100644 --- a/course_files/confounders.Rmd +++ b/course_files/confounders.Rmd @@ -8,12 +8,12 @@ knit: bookdown::preview_chapter There is a large number of potential confounders, artifacts and biases in sc-RNA-seq data. One of the main challenges in analyzing scRNA-seq data stems from the fact that it is difficult to carry out a true technical replicate (why?) to distinguish biological and technical variability. In the previous chapters we considered batch effects and in this chapter we will continue to explore how experimental artifacts can be identified and removed. We will continue using the `scater` package since it provides a set of methods specifically for quality control of experimental and explanatory variables. Moreover, we will continue to work with the Blischak data that was used in the previous chapter. -```{r, echo=FALSE} +```{r confounders0, echo=FALSE} library(knitr) -opts_chunk$set(out.width='90%', fig.align = 'center') +opts_chunk$set(cache = TRUE, out.width='90%', fig.align = 'center') ``` -```{r, message=FALSE, warning=FALSE} +```{r confounders1, message=FALSE, warning=FALSE} library(scater, quietly = TRUE) options(stringsAsFactors = FALSE) umi <- readRDS("data/tung/umi.rds") @@ -26,7 +26,7 @@ The `umi.qc` dataset contains filtered cells and genes. Our next step is to expl ### Correlations with PCs Let's first look again at the PCA plot of the QCed dataset: -```{r confound-pca, fig.cap = "PCA plot of the tung data"} +```{r confounders2, fig.cap = "PCA plot of the tung data"} tmp <- runPCA( umi.qc[endog_genes, ], exprs_values = "logcounts_raw" @@ -44,7 +44,7 @@ Let's test whether some of the variables correlate with any of the PCs. #### Detected genes -```{r confound-find-pcs-total-features, fig.cap = "PC correlation with the number of detected genes", fig.asp=1} +```{r confounders3, fig.cap = "PC correlation with the number of detected genes", fig.asp=1} logcounts(umi.qc) <- assay(umi.qc, "logcounts_raw") plotExplanatoryPCs( umi.qc[endog_genes, ], @@ -59,7 +59,7 @@ Indeed, we can see that `PC1` can be almost completely explained by the number o `scater` can also compute the marginal $R^2$ for each variable when fitting a linear model regressing expression values for each gene against just that variable, and display a density plot of the gene-wise marginal $R^2$ values for the variables. -```{r confound-find-expl-vars, fig.cap = "Explanatory variables"} +```{r confounders4, fig.cap = "Explanatory variables"} plotExplanatoryVariables( umi.qc[endog_genes, ], exprs_values = "logcounts_raw", @@ -95,6 +95,6 @@ Perform the same analysis with read counts of the Blischak data. 
Use the `tung/reads.rds` file.
 
 ### sessionInfo()
 
-```{r echo=FALSE}
+```{r confounders5, echo=FALSE}
 sessionInfo()
 ```
diff --git a/course_files/de-intro.Rmd b/course_files/de-intro.Rmd
index d86325c46..81e872e54 100644
--- a/course_files/de-intro.Rmd
+++ b/course_files/de-intro.Rmd
@@ -4,9 +4,9 @@ knit: bookdown::preview_chapter
 
 ## Differential Expression (DE) analysis {#dechapter}
 
-```{r, echo=FALSE}
+```{r de-intro0, echo=FALSE}
 library(knitr)
-opts_chunk$set(fig.align = "center")
+opts_chunk$set(cache = TRUE, fig.align = "center")
 ```
 
 ### Bulk RNA-seq
@@ -47,7 +47,7 @@ Alternatively, we can use a non-parametric test which does not assume that expre
 
 The most common model of RNASeq data is the negative binomial model:
 
-```{r nb-plot, fig.cap="Negative Binomial distribution of read counts for a single gene across 1000 cells"}
+```{r de-intro1, fig.cap="Negative Binomial distribution of read counts for a single gene across 1000 cells"}
 set.seed(1)
 hist(
     rnbinom(
@@ -69,7 +69,7 @@ It is parameterized by the mean expression (mu) and the dispersion (size), which
 
 However, a raw negative binomial model does not fit full-length transcript data as well due to the high dropout rates relative to the non-zero read counts. For this type of data a variety of zero-inflated negative binomial models have been proposed (e.g. [MAST](https://bioconductor.org/packages/release/bioc/html/MAST.html), [SCDE](https://bioconductor.org/packages/release/bioc/html/scde.html)).
 
-```{r zero-inflation-plot, fig.cap="Zero-inflated Negative Binomial distribution"}
+```{r de-intro2, fig.cap="Zero-inflated Negative Binomial distribution"}
 d <- 0.5;
 counts <- rnbinom(
     1000,
@@ -94,7 +94,7 @@ These models introduce a new parameter $d$, for the dropout rate, to the negativ
 
 Finally, several methods use a Poisson-Beta distribution which is based on a mechanistic model of transcriptional bursting. There is strong experimental support for this model ([Kim and Marioni, 2013](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-1-r7)) and it provides a good fit to scRNA-seq data, but it is less easy to use than the negative-binomial models and there are far fewer existing methods to build upon than for the negative binomial model.
 
-```{r pois-beta-plot, fit.cap="Poisson-Beta distribution"}
+```{r de-intro3, fig.cap="Poisson-Beta distribution"}
 a <- 0.1
 b <- 0.1
 g <- 100
diff --git a/course_files/de-real.Rmd b/course_files/de-real.Rmd
index 906799f24..f027fbcaa 100644
--- a/course_files/de-real.Rmd
+++ b/course_files/de-real.Rmd
@@ -5,11 +5,11 @@ output: html_document
 
 ## DE in a real dataset
 
-```{r, echo=FALSE}
+```{r de-real0, echo=FALSE}
 library(knitr)
-opts_chunk$set(fig.align = "center")
+opts_chunk$set(cache = TRUE, fig.align = "center")
 ```
-```{r, echo=TRUE, message=FALSE, warning=FALSE}
+```{r de-real1, echo=TRUE, message=FALSE, warning=FALSE}
 library(scRNA.seq.funcs)
 library(edgeR)
 library(monocle)
@@ -25,7 +25,7 @@ For this experiment bulk RNA-seq data for each cell-line was generated in additi
 differentially expressed genes identified using standard methods on the respective bulk data as the ground truth for evaluating the accuracy of each single-cell method. To save time we have pre-computed these for you. You can run the commands below to load these data.
 
-```{r}
+```{r de-real2}
 DE <- read.table("data/tung/TPs.txt")
 notDE <- read.table("data/tung/TNs.txt")
 GroundTruth <- list(
@@ -36,7 +36,7 @@ GroundTruth <- list(
 
 This ground truth has been produced for the comparison of individual NA19101 to NA19239.
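Before using these lists, it is worth a quick sanity check — for example, confirming how many genes are in each list and that the two lists do not overlap. This check is our own addition, not part of the original code:

```r
# sizes of the ground-truth gene lists, and their overlap (expected to be empty)
length(GroundTruth$DE)
length(GroundTruth$notDE)
intersect(GroundTruth$DE, GroundTruth$notDE)
```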
Now load the respective single-cell data: -```{r} +```{r de-real3} molecules <- read.table("data/tung/molecules.txt", sep = "\t") anno <- read.table("data/tung/annotation.txt", sep = "\t", header = TRUE) keep <- anno[,1] == "NA19101" | anno[,1] == "NA19239" @@ -62,13 +62,13 @@ ones. The most commonly used non-parametric test is the The KS-test quantifies the distance between the empirical cummulative distributions of the expression of each gene in each of the two populations. It is sensitive to changes in mean experession and changes in variability. However it assumes data is continuous and may perform poorly when data contains a large number of identical values (eg. zeros). Another issue with the KS-test is that it can be very sensitive for large sample sizes and thus it may end up as significant even though the magnitude of the difference is very small. -```{r ks-statistic, echo=FALSE, out.width = '60%', fig.cap="Illustration of the two-sample Kolmogorov–Smirnov statistic. Red and blue lines each correspond to an empirical distribution function, and the black arrow is the two-sample KS statistic. (taken from [here](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test))"} +```{r de-real4, echo=FALSE, out.width = '60%', fig.cap="Illustration of the two-sample Kolmogorov–Smirnov statistic. Red and blue lines each correspond to an empirical distribution function, and the black arrow is the two-sample KS statistic. (taken from [here](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test))"} knitr::include_graphics("figures/KS2_Example.png") ``` Now run the test: -```{r, message=FALSE, warning=FALSE} +```{r de-real5, message=FALSE, warning=FALSE} pVals <- apply( norm, 1, function(x) { ks.test( @@ -84,7 +84,7 @@ This code "applies" the function to each row (specified by 1) of the expression #### Evaluating Accuracy -```{r} +```{r de-real6} sigDE <- names(pVals)[pVals < 0.05] length(sigDE) # Number of KS-DE genes @@ -96,7 +96,7 @@ sum(GroundTruth$notDE %in% sigDE) As you can see many more of our ground truth negative genes were identified as DE by the KS-test (false positives) than ground truth positive genes (true positives), however this may be due to the larger number of notDE genes thus we typically normalize these counts as the True positive rate (TPR), TP/(TP + FN), and False positive rate (FPR), FP/(FP+TP). -```{r} +```{r de-real7} tp <- sum(GroundTruth$DE %in% sigDE) fp <- sum(GroundTruth$notDE %in% sigDE) tn <- sum(GroundTruth$notDE %in% names(pVals)[pVals >= 0.05]) @@ -109,7 +109,7 @@ Now we can see the TPR is much higher than the FPR indicating the KS test is ide So far we've only evaluated the performance at a single significance threshold. Often it is informative to vary the threshold and evaluate performance across a range of values. This is then plotted as a receiver-operating-characteristic curve (ROC) and a general accuracy statistic can be calculated as the area under this curve (AUC). We will use the ROCR package to facilitate this plotting. 
-```{r ks-roc-plot, fig.cap="ROC curve for KS-test."}
+```{r de-real8, fig.cap="ROC curve for KS-test."}
 # Only consider genes for which we know the ground truth
 pVals <- pVals[names(pVals) %in% GroundTruth$DE | 
                names(pVals) %in% GroundTruth$notDE]
@@ -123,7 +123,7 @@ aucObj@y.values[[1]] # AUC
 ```
 
 Finally, to facilitate comparison of the other DE methods, let's put this code into a function so we don't need to repeat it:
-```{r}
+```{r de-real9}
 DE_Quality_AUC <- function(pVals) {
     pVals <- pVals[names(pVals) %in% GroundTruth$DE | 
                    names(pVals) %in% GroundTruth$notDE]
@@ -140,7 +140,7 @@ DE_Quality_AUC <- function(pVals) {
 
 The Wilcox-rank-sum test is another non-parametric test, but tests specifically if values in one group are greater/less than the values in the other group. Thus it is often considered a test for difference in median expression between two groups; whereas the KS-test is sensitive to any change in distribution of expression values.
 
-```{r wilcox-plot, fig.cap="ROC curve for Wilcox test.", message=FALSE, warning=FALSE}
+```{r de-real10, fig.cap="ROC curve for Wilcox test.", message=FALSE, warning=FALSE}
 pVals <- apply(
     norm, 1, function(x) {
         wilcox.test(
@@ -158,7 +158,7 @@ DE_Quality_AUC(pVals)
 
 We've already used edgeR for differential expression in Chapter \@ref(dealing-with-confounders). edgeR is based on a negative binomial model of gene expression and uses a generalized linear model (GLM) framework, which enables us to include other factors such as batch in the model.
 
-```{r edger-plot, fig.cap="ROC curve for edgeR.", message=FALSE}
+```{r de-real11, fig.cap="ROC curve for edgeR.", message=FALSE}
 dge <- DGEList(
     counts = counts, 
     norm.factors = rep(1, length(counts[1,])), 
@@ -180,7 +180,7 @@ DE_Quality_AUC(pVals)
 
 [Monocle](https://bioconductor.org/packages/release/bioc/html/monocle.html) can use several different models for DE. For count data it recommends the Negative Binomial model (negbinomial.size). For normalized data it recommends log-transforming it then using a normal distribution (gaussianff). Similar to edgeR, this method uses a GLM framework, so in theory it can account for batches; however, in practice the model fails for this dataset if batches are included.
 
-```{r Monocle-plot, fig.cap="ROC curve for Monocle.", message=FALSE, warning=FALSE}
+```{r de-real12, fig.cap="ROC curve for Monocle.", message=FALSE, warning=FALSE}
 pd <- data.frame(group = group, batch = batch)
 rownames(pd) <- colnames(counts)
 pd <- new("AnnotatedDataFrame", data = pd)
@@ -202,7 +202,7 @@ DE_Quality_AUC(pVals)
 
 __Exercise__: Compare the results using the negative binomial model on counts and those from using the normal/gaussian model (`gaussianff()`) on log-transformed normalized counts.
 
 __Answer__:
-```{r Monocle-plot2, fig.cap="ROC curve for Monocle-gaussian.", message=FALSE, echo=FALSE, warning=FALSE}
+```{r de-real13, fig.cap="ROC curve for Monocle-gaussian.", message=FALSE, echo=FALSE, warning=FALSE}
 pd <- data.frame(group = group, batch = batch)
 rownames(pd) <- colnames(norm)
 pd <- new("AnnotatedDataFrame", data = pd)
@@ -226,7 +226,7 @@ DE_Quality_AUC(pVals)
 
 [MAST](https://bioconductor.org/packages/release/bioc/html/MAST.html) is based on a zero-inflated negative binomial model. It tests for differential expression using a hurdle model to combine tests of discrete (0 vs not zero) and continuous (non-zero values) aspects of gene expression. Again, this uses a linear modelling framework to enable complex models to be considered.
-```{r MAST-plot, fig.cap="ROC curve for MAST.", message=FALSE} +```{r de-real14, fig.cap="ROC curve for MAST.", message=FALSE} log_counts <- log(counts + 1) / log(2) fData <- data.frame(names = rownames(log_counts)) rownames(fData) <- rownames(log_counts); @@ -258,7 +258,7 @@ These methods are too slow to run today but we encourage you to try them out on [BPSC](https://academic.oup.com/bioinformatics/article/32/14/2128/2288270/Beta-Poisson-model-for-single-cell-RNA-seq-data) uses the Poisson-Beta model of single-cell gene expression, which we discussed in the previous chapter, and combines it with generalized linear models which we've already encountered when using edgeR. BPSC performs comparisons of one or more groups to a reference group ("control") and can include other factors such as batches in the model. -```{r, message=FALSE, eval=FALSE} +```{r de-real15, message=FALSE, eval=FALSE} library(BPSC) bpsc_data <- norm[,batch=="NA19101.r1" | batch=="NA19239.r1"] bpsc_group = group[batch=="NA19101.r1" | batch=="NA19239.r1"] @@ -276,7 +276,7 @@ DE_Quality_AUC(pVals) ### SCDE [SCDE](http://hms-dbmi.github.io/scde/) is the first single-cell specific DE method. It fits a zero-inflated negative binomial model to expression data using Bayesian statistics. The usage below tests for differences in mean expression of individual genes across groups but recent versions include methods to test for differences in mean expression or dispersion of groups of genes, usually representing a pathway. -```{r, eval=FALSE} +```{r de-real16, eval=FALSE} library(scde) cnts <- apply( counts, @@ -320,6 +320,6 @@ DE_Quality_AUC(pVals) ### sessionInfo() -```{r echo=FALSE} +```{r de-real17, echo=FALSE} sessionInfo() ``` diff --git a/course_files/dropouts.Rmd b/course_files/dropouts.Rmd index 98ed7ddf3..ce157f733 100644 --- a/course_files/dropouts.Rmd +++ b/course_files/dropouts.Rmd @@ -5,11 +5,11 @@ output: html_document ## Feature Selection -```{r, echo=FALSE} +```{r dropout0, echo=FALSE} library(knitr) -opts_chunk$set(fig.align = "center") +opts_chunk$set(cache = TRUE, fig.align = "center") ``` -```{r, echo=TRUE, message=FALSE, warning=FALSE} +```{r dropout1, echo=TRUE, message=FALSE, warning=FALSE} library(scRNA.seq.funcs) library(matrixStats) library(M3Drop) @@ -43,7 +43,7 @@ are expressed at different levels across groups. For this section we will continue working with the Deng data. -```{r} +```{r dropout2} deng <- readRDS("data/deng/deng-reads.rds") celltype_labs <- colData(deng)$cell_type2 cell_colors <- brewer.pal(max(3,length(unique(celltype_labs))), "Set3") @@ -60,7 +60,7 @@ M3Drop feature selection is runs direction on a normalized (but not log-transfor expression matrix. This can be extracted from our SingleCellExperiment object using the command below. -```{r} +```{r dropout3} expr_matrix <- M3Drop::M3DropConvertData(deng) ``` @@ -69,12 +69,12 @@ scater, SingleCellExperiment, monocle, and Seurat. It can also convert an existi expression matrix to the correct form (removing undetected genes & normalizing/delogging) if you specify whether the matrix is raw counts, or log transformed. Check the manual for details: -```{r eval=FALSE} +```{r dropout4, eval=FALSE} ?M3Drop::M3DropConvertData ``` __Exercise 1__: Confirm that the conversion function has removed undetected genes: -```{r, include=FALSE} +```{r dropout5, include=FALSE} nrow(counts(deng)) - nrow(expr_matrix) summary( rowSums(counts(deng))[! 
rownames(counts(deng)) %in% rownames(expr_matrix) ] )
 ```
 
@@ -105,7 +105,7 @@ __Exercise 2__
 
Use the functions rowMeans and rowVars to plot the relationship between mean expression and variance for all genes in this dataset. (Hint: use log="xy" to plot on a log-scale).
 
-```{r, echo=FALSE, fig.width = 8.5, fig.height = 6}
+```{r dropout6, echo=FALSE, fig.width = 8.5, fig.height = 6}
 plot(
     rowMeans(expr_matrix),
     rowVars(expr_matrix),
@@ -131,7 +131,7 @@ is the fitted technical noise model and the dashed line is the 95% CI. Pink
dots are the genes with significant biological variability after multiple-testing correction.
 
-```{r, fig.width = 7, fig.height = 6}
+```{r dropout7, fig.width = 7, fig.height = 6}
 Brennecke_HVG <- BrenneckeGetVariableGenes(
     expr_matrix,
     fdr = 0.01,
@@ -143,14 +143,14 @@ This function returns a matrix of significant genes as well as their estimated e
between observed and expected coefficient of variation), and their significance as raw p.values and FDR corrected q.values. For now we will just keep the names of the significant HVG genes.
 
-```{r}
+```{r dropout8}
 HVG_genes <- Brennecke_HVG$Gene
 ```
 
__Exercise 3__
How many genes were significant using BrenneckeGetVariableGenes?
 
-```{r, echo=FALSE, fig.width = 8.5, fig.height = 6}
+```{r dropout9, echo=FALSE, fig.width = 8.5, fig.height = 6}
 length(HVG_genes)
 ```
 #### High Dropout Genes
 
@@ -171,7 +171,7 @@ Because the Michaelis-Menten equation is a convex non-linear function, genes whi
differentially expressed across two or more populations of cells in our dataset will be shifted up/right of the Michaelis-Menten model (see Figure below).
 
-```{r, fig.width = 8.5, fig.height = 6, echo=TRUE}
+```{r dropout10, fig.width = 8.5, fig.height = 6, echo=TRUE}
 K <- 49
 S_sim <- 10^seq(from = -3, to = 4, by = 0.05) # range of expression values
 MM <- 1 - S_sim / (K + S_sim)
@@ -208,7 +208,7 @@ __Note__: add `log="x"` to the `plot` call above to see how this looks on the lo
 
__Exercise 4__: Produce the same plot as above with different expression levels (S1 & S2) and/or mixtures (mix).
 
-```{r, include=FALSE}
+```{r dropout11, include=FALSE}
 plot(
     S_sim,
     MM,
@@ -242,7 +242,7 @@ points(
 
We use M3Drop to identify significant outliers to the right of the MM curve. We also apply 1% FDR multiple testing correction:
 
-```{r, fig.width = 7, fig.height = 6}
+```{r dropout12, fig.width = 7, fig.height = 6}
 M3Drop_genes <- M3DropFeatureSelection(
     expr_matrix,
     mt_method = "fdr",
@@ -271,7 +271,7 @@ the Deng data is not UMI counts the model does not fit the noise sufficiently an
far too many genes will be called as significant. Thus we will take the top 1500 by effect size.
 
-```{r, fig.width=8, fig.height=5}
+```{r dropout13, fig.width=8, fig.height=5}
 deng_int <- NBumiConvertData(deng)
 DANB_fit <- NBumiFitModel(deng_int) # DANB is fit to the raw count matrix
 # Perform DANB feature selection
@@ -281,7 +281,7 @@ DANB_genes <- DropFS[1:1500,]$Gene
 
__Exercise 5__
How many genes were significant using NBumiFeatureSelectionCombinedDrop?
 
-```{r, echo=FALSE, fig.width = 8.5, fig.height = 6}
+```{r dropout14, echo=FALSE, fig.width = 8.5, fig.height = 6}
 nrow(DropFS)
 ```
 
@@ -301,7 +301,7 @@ result it is more appropriate to take the top few thousand genes as ranked by ge
consider the significance of the correlations.
 
 
-```{r, eval=FALSE}
+```{r dropout15, eval=FALSE}
 cor_feat <- M3Drop::corFS(expr_matrix)
 Cor_genes <- names(cor_feat)[1:1500]
 ```
 
@@ -312,7 +312,7 @@ may be relevant to the underlying biology. 
However, as with gene-gene correlatio be susceptible to detecting systematic variation due to batch effects; thus it is recommended to plot the PCA results to determine those components corresponding to the biological variation rather than batch effects. -```{r, fig.width=7, fig.height=6} +```{r dropout16, fig.width=7, fig.height=6} # PCA is typically performed on log-transformed expression data pca <- prcomp(log(expr_matrix + 1) / log(2)) @@ -332,7 +332,7 @@ PCA_genes <- names(score[1:1500]) __Exercise 6__ Consider the top 5 principal components. Which appear to be most biologically relevant? How does the top 1,500 features change if you consider the loadings for those components? -```{r, include=FALSE} +```{r dropout17, include=FALSE} plot( pca$rotation[,2], pca$rotation[,3], @@ -356,7 +356,7 @@ PCA_genes2 = names(score[1:1500]) We can check whether the identified features really do represent genes differentially expressed between cell-types in this dataset. -```{r, fig.width = 7, fig.height = 10} +```{r dropout18, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( M3Drop_genes, expr_matrix, @@ -365,7 +365,7 @@ M3DropExpressionHeatmap( ``` We can also consider how consistent each feature selection method is with the others using the Jaccard Index: -```{r} +```{r dropout19} J <- sum(M3Drop_genes %in% HVG_genes)/length(unique(c(M3Drop_genes, HVG_genes))) ``` @@ -373,7 +373,7 @@ __Exercise 7__ Plot the expression of the features for each of the other methods. Which appear to be differentially expressed? How consistent are the different methods for this dataset? -```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} +```{r dropout20, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( HVG_genes, expr_matrix, @@ -381,7 +381,7 @@ M3DropExpressionHeatmap( ) ``` -```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} +```{r dropout21, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( Cor_genes, expr_matrix, @@ -389,7 +389,7 @@ M3DropExpressionHeatmap( ) ``` -```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} +```{r dropout22, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( PCA_genes, expr_matrix, @@ -397,7 +397,7 @@ M3DropExpressionHeatmap( ) ``` -```{r, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} +```{r dropout23, eval=FALSE, include=FALSE, fig.width = 7, fig.height = 10} M3DropExpressionHeatmap( PCA_genes2, expr_matrix, @@ -405,7 +405,7 @@ M3DropExpressionHeatmap( ) ``` -```{r, eval=FALSE, include=FALSE} +```{r dropout24, eval=FALSE, include=FALSE} list_of_features <- list( M3Drop_genes, HVG_genes, @@ -430,6 +430,6 @@ colnames(Out) <- rownames(Out) <- c("M3Drop", "HVG", "Cor", "PCA", "PCA2") ### sessionInfo() -```{r echo=FALSE} +```{r dropout25, echo=FALSE} sessionInfo() ``` diff --git a/course_files/exp-methods.Rmd b/course_files/exp-methods.Rmd index 7078c4e98..293068af8 100644 --- a/course_files/exp-methods.Rmd +++ b/course_files/exp-methods.Rmd @@ -2,14 +2,14 @@ output: html_document --- -```{r exp-methods0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} +```{r Exp-methods0, echo=FALSE} library(knitr) -opts_chunk$set(fig.align = "center", echo=FALSE, out.width = '70%', cache=TRUE, cache.extra = list(R.version, sessionInfo())) +opts_chunk$set(cache = TRUE, fig.align = "center", echo=FALSE, out.width = '70%') ``` ## Experimental methods -```{r exp-methods1, fig.cap="Moore's law in single cell transcriptomics 
(image taken from [Svensson et al](https://arxiv.org/abs/1704.01379))", out.width = '100%'} +```{r Exp-methods1, fig.cap="Moore's law in single cell transcriptomics (image taken from [Svensson et al](https://arxiv.org/abs/1704.01379))", out.width = '100%'} knitr::include_graphics("figures/moores-law.png") ``` @@ -33,19 +33,19 @@ For quantification, there are two types, __full-length__ and __tag-based__. The The strategy used for capture determines throughput, how the cells can be selected as well as what kind of additional information besides the sequencing that can be obtained. The three most widely used options are __microwell-__, __microfluidic-__ and __droplet-__ based. -```{r exp-methods2, fig.cap="Image of microwell plates (image taken from Wikipedia)"} +```{r Exp-methods2, fig.cap="Image of microwell plates (image taken from Wikipedia)"} knitr::include_graphics("figures/300px-Microplates.jpg") ``` For well-based platforms, cells are isolated using for example pipette or laser capture and placed in microfluidic wells. One advantage of well-based methods is that they can be combined with fluorescent activated cell sorting (FACS), making it possible to select cells based on surface markers. This strategy is thus very useful for situations when one wants to isolate a specific subset of cells for sequencing. Another advantage is that one can take pictures of the cells. The image provides an additional modality and a particularly useful application is to identify wells containg damaged cells or doublets. The main drawback of these methods is that they are often low-throughput and the amount of work required per cell may be considerable. -```{r exp-methods3, fig.cap="Image of a 96-well Fluidigm C1 chip (image taken from Fluidigm)"} +```{r Exp-methods3, fig.cap="Image of a 96-well Fluidigm C1 chip (image taken from Fluidigm)"} knitr::include_graphics("figures/fluidigmC1.jpg") ``` Microfluidic platforms, such as [Fluidigm's C1](https://www.fluidigm.com/products/c1-system#workflow), provide a more integrated system for capturing cells and for carrying out the reactions necessary for the library preparations. Thus, they provide a higher throughput than microwell based platforms. Typically, only around 10% of cells are captured in a microfluidic platform and thus they are not appropriate if one is dealing with rare cell-types or very small amounts of input. Moreover, the chip is relatively expensive, but since reactions can be carried out in a smaller volume money can be saved on reagents. -```{r exp-methods4, out.width = '60%', fig.cap="Schematic overview of the drop-seq method (Image taken from Macosko et al)"} +```{r Exp-methods4, out.width = '60%', fig.cap="Schematic overview of the drop-seq method (Image taken from Macosko et al)"} knitr::include_graphics("figures/drop-seq.png") ``` @@ -59,13 +59,13 @@ Clearly, full-length transcript quantification will be more appropriate if one i Two recent studies from the Enard group [@Ziegenhain2017-cu] and the Teichmann group [@Svensson2017-op] have compared several different protocols. In their study, Ziegenhain et al compared five different protocols on the same sample of mouse embryonic stem cells (mESCs). By controlling for the number of cells as well as the sequencing depth, the authors were able to directly compare the sensitivity, noise-levels and costs of the different protocols. 
One example of their conclusions is illustrated in the figure below which shows the number of genes detected (for a given detection threshold) for the different methods. As you can see, there is almost a two-fold difference between drop-seq and Smart-seq2, suggesting that the choice of protocol can have a major impact on the study.
 
-```{r exp-methods5, out.width = '60%', fig.cap="Enard group study"}
+```{r Exp-methods5, out.width = '60%', fig.cap="Enard group study"}
 knitr::include_graphics("figures/ziegenhainEnardFig1.png")
 ```
 
Svensson et al take a different approach by using synthetic transcripts (spike-ins, more about these later) with known concentrations to measure the accuracy and sensitivity of different protocols. Comparing a wide range of studies, they also reported substantial differences between the protocols.
 
-```{r exp-methods6, out.width = '100%', fig.cap="Teichmann group study"}
+```{r Exp-methods6, out.width = '100%', fig.cap="Teichmann group study"}
 knitr::include_graphics("figures/svenssonTeichmannFig2.png")
 ```
diff --git a/course_files/exprs-constr.Rmd b/course_files/exprs-constr.Rmd
index 047f8459b..9e01c4d68 100644
--- a/course_files/exprs-constr.Rmd
+++ b/course_files/exprs-constr.Rmd
@@ -4,9 +4,9 @@ output: html_document
 
 # Construction of expression matrix
 
-```{r exprs-constr0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())}
+```{r Exprs-constr0, echo=FALSE}
 library(knitr)
-opts_chunk$set(fig.align = "center", echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo()))
+opts_chunk$set(cache = TRUE, fig.align = "center", echo=FALSE)
 ```
 
Many analyses of scRNA-seq data take as their starting point an __expression matrix__. By convention, each row of the expression matrix represents a gene and each column represents a cell (although some authors use the transpose). Each entry represents the expression level of a particular gene in a given cell. The units by which the expression is measured depend on the protocol and the normalization strategy used.
 
@@ -22,7 +22,7 @@
$/fastQC experiment.bam
 
Below is an example of the output from FastQC for a dataset of 125 bp reads. The plot reveals a technical error which resulted in a couple of bases failing to be read correctly in the centre of the read. However, since the rest of the read was of high quality this error will most likely have a negligible effect on mapping efficiency.
 
-```{r exprs-constr1, out.width = '90%', fig.cap="Example of FastQC output"}
+```{r Exprs-constr1, out.width = '90%', fig.cap="Example of FastQC output"}
 knitr::include_graphics("figures/per_base_quality.png")
 ```
 
@@ -73,7 +73,7 @@ analysis.
 
The two yellow arrows point to cells with a surprisingly large number of unmapped reads. In this example we kept the cells during the alignment QC step, but they were later removed during cell QC due to a high proportion of ribosomal RNA reads.
 
-```{r exprs-constr2, out.width = '90%', fig.cap="Example of the total number of reads mapped to each cell."}
+```{r Exprs-constr2, out.width = '90%', fig.cap="Example of the total number of reads mapped to each cell."}
 knitr::include_graphics("figures/Bergiers_exp1_mapping_by_cell.png")
 ```
 
@@ -89,7 +89,7 @@ python /split_bam.py -i input.bam -r rRNAmask.bed -o output.txt
 
However, the expected results will depend on the experimental protocol, e.g. many scRNA-seq methods use poly-A selection to avoid sequencing rRNAs which results in a 3' bias in the read coverage across the genes (aka gene body coverage). 
The figure below shows this 3' bias as well as three cells which were outliers and removed from the dataset: -```{r exprs-constr3, out.width = '90%', fig.cap="Example of the 3' bias in the read coverage."} +```{r Exprs-constr3, out.width = '90%', fig.cap="Example of the 3' bias in the read coverage."} knitr::include_graphics("figures/Exp1_RSEQC_geneBodyCoverage_plot_Combined.png") ``` diff --git a/course_files/exprs-norm-reads.Rmd b/course_files/exprs-norm-reads.Rmd index 085566999..807532130 100644 --- a/course_files/exprs-norm-reads.Rmd +++ b/course_files/exprs-norm-reads.Rmd @@ -4,20 +4,20 @@ output: html_document ## Normalization practice (Reads) -```{r, echo=FALSE, message=FALSE, warning=FALSE} +```{r exprs-norm-read0, echo=FALSE, message=FALSE, warning=FALSE} library(scRNA.seq.funcs) library(scater) library(scran) options(stringsAsFactors = FALSE) set.seed(1234567) library(knitr) -opts_chunk$set(out.width='90%', fig.align = 'center', echo=FALSE) +opts_chunk$set(cache= TRUE, out.width='90%', fig.align = 'center', echo=FALSE) reads <- readRDS("data/tung/reads.rds") reads.qc <- reads[rowData(reads)$use, colData(reads)$use] endog_genes <- !rowData(reads.qc)$is_feature_control ``` -```{r norm-pca-raw-reads, fig.cap = "PCA plot of the tung data"} +```{r exprs-norm-read1, fig.cap = "PCA plot of the tung data"} tmp <- runPCA( reads.qc[endog_genes, ], exprs_values = "logcounts_raw" @@ -30,7 +30,7 @@ plotPCA( ) ``` -```{r norm-pca-cpm-reads, fig.cap = "PCA plot of the tung data after CPM normalisation"} +```{r exprs-norm-read2, fig.cap = "PCA plot of the tung data after CPM normalisation"} logcounts(reads.qc) <- log2(calculateCPM(reads.qc, use_size_factors = FALSE) + 1) plotPCA( reads.qc[endog_genes, ], @@ -39,7 +39,7 @@ plotPCA( shape_by = "individual" ) ``` -```{r norm-ours-rle-cpm-reads, fig.cap = "Cell-wise RLE of the tung data", warning=FALSE} +```{r exprs-norm-read3, fig.cap = "Cell-wise RLE of the tung data", warning=FALSE} plotRLE( reads.qc[endog_genes, ], exprs_values = "logcounts_raw", @@ -52,7 +52,7 @@ plotRLE( ) ``` -```{r norm-pca-lsf-umi, fig.cap = "PCA plot of the tung data after LSF normalisation"} +```{r exprs-norm-read4, fig.cap = "PCA plot of the tung data after LSF normalisation"} qclust <- quickCluster(reads.qc, min.size = 30) reads.qc <- computeSumFactors(reads.qc, sizes = 15, clusters = qclust) reads.qc <- normalize(reads.qc) @@ -64,7 +64,7 @@ plotPCA( ) ``` -```{r norm-ours-rle-scran-reads, fig.cap = "Cell-wise RLE of the tung data"} +```{r exprs-norm-read5, fig.cap = "Cell-wise RLE of the tung data"} plotRLE( reads.qc[endog_genes, ], exprs_values = "logcounts_raw", @@ -77,7 +77,7 @@ plotRLE( ) ``` -```{r norm-pca-downsample-reads, fig.cap = "PCA plot of the tung data after downsampling"} +```{r exprs-norm-read6, fig.cap = "PCA plot of the tung data after downsampling"} logcounts(reads.qc) <- log2(Down_Sample_Matrix(counts(reads.qc)) + 1) plotPCA( reads.qc[endog_genes, ], @@ -86,7 +86,7 @@ plotPCA( shape_by = "individual" ) ``` -```{r norm-ours-rle-downsample-reads, fig.cap = "Cell-wise RLE of the tung data"} +```{r exprs-norm-read7, fig.cap = "Cell-wise RLE of the tung data"} plotRLE( reads.qc[endog_genes, ], exprs_values = "logcounts_raw", @@ -99,7 +99,7 @@ plotRLE( ) ``` -```{r} +```{r exprs-norm-read8} reads.qc <- getBMFeatureAnnos( reads.qc, filters = "ensembl_gene_id", @@ -116,20 +116,20 @@ reads.qc <- getBMFeatureAnnos( ) ``` -```{r} +```{r exprs-norm-read9} reads.qc.ann <- reads.qc[!is.na(rowData(reads.qc)$ensembl_gene_id), ] ``` -```{r} +```{r 
exprs-norm-read10} eff_length <- abs(rowData(reads.qc.ann)$end_position - rowData(reads.qc.ann)$start_position) / 1000 ``` -```{r} +```{r exprs-norm-read11} tpm(reads.qc.ann) <- log2(calculateTPM(reads.qc.ann, eff_length) + 1) ``` -```{r norm-pca-tpm-reads, fig.cap = "PCA plot of the tung data after TPM normalisation"} +```{r exprs-norm-read12, fig.cap = "PCA plot of the tung data after TPM normalisation"} tmp <- runPCA( reads.qc.ann, exprs_values = "tpm", @@ -142,11 +142,11 @@ plotPCA( ) ``` -```{r} +```{r exprs-norm-read13} tpm(reads.qc.ann) <- log2(calculateFPKM(reads.qc.ann, eff_length) + 1) ``` -```{r norm-pca-fpkm-reads, fig.cap = "PCA plot of the tung data after FPKM normalisation", eval=FALSE} +```{r exprs-norm-read14, fig.cap = "PCA plot of the tung data after FPKM normalisation", eval=FALSE} tmp <- runPCA( reads.qc.ann, exprs_values = "tpm", @@ -159,7 +159,7 @@ plotPCA( ) ``` -```{r} +```{r exprs-norm-read15} sessionInfo() ``` diff --git a/course_files/exprs-norm.Rmd b/course_files/exprs-norm.Rmd index db2dde91c..81c8b4f1b 100644 --- a/course_files/exprs-norm.Rmd +++ b/course_files/exprs-norm.Rmd @@ -6,10 +6,10 @@ output: html_document ### Introduction -```{r, echo=FALSE} +```{r exprs-norm0, echo=FALSE} library(scRNA.seq.funcs) library(knitr) -opts_chunk$set(out.width='90%', fig.align = 'center') +opts_chunk$set(cache = TRUE, out.width='90%', fig.align = 'center') insert_fun <- function(name) { read_chunk(lines = capture.output(dump(name, '')), labels = paste(name, 'source', sep = '-')) } @@ -42,7 +42,7 @@ million (__CPM__) by dividing each column by its total then multiplying by calculation of total expression in order to correct for total cell RNA content, therefore we will only use endogenous genes. Example of a __CPM__ function in `R`: -```{r calc_cpm-source, eval=FALSE} +```{r exprs-norm1, eval=FALSE} ``` @@ -56,7 +56,7 @@ To deal with this potentiality several other measures were devised. The __size factor (SF)__ was proposed and popularized by DESeq [@Anders2010-jr]. First the geometric mean of each gene across all cells is calculated. The size factor for each cell is the median across genes of the ratio of the expression to the gene's geometric mean. A drawback to this method is that since it uses the geometric mean only genes with non-zero expression values across all cells can be used in its calculation, making it unadvisable for large low-depth scRNASeq experiments. `edgeR` & `scater` call this method __RLE__ for "relative log expression". Example of a __SF__ function in `R`: -```{r calc_sf-source, eval=FALSE} +```{r exprs-norm2, eval=FALSE} ``` @@ -64,7 +64,7 @@ The __size factor (SF)__ was proposed and popularized by DESeq [@Anders2010-jr]. The __upperquartile (UQ)__ was proposed by [@Bullard2010-eb]. Here each column is divided by the 75% quantile of the counts for each library. Often the calculated quantile is scaled by the median across cells to keep the absolute level of expression relatively consistent. A drawback to this method is that for low-depth scRNASeq experiments the large number of undetected genes may result in the 75% quantile being zero (or close to it). This limitation can be overcome by generalizing the idea and using a higher quantile (eg. the 99% quantile is the default in scater) or by excluding zeros prior to calculating the 75% quantile. 
Example of a __UQ__ function in `R`: -```{r calc_uq-source, eval=FALSE} +```{r exprs-norm3, eval=FALSE} ``` @@ -80,7 +80,7 @@ Another method is called __TMM__ is the weighted trimmed mean of M-values (to th A final way to correct for library size is to downsample the expression matrix so that each cell has approximately the same total number of molecules. The benefit of this method is that zero values will be introduced by the down sampling thus eliminating any biases due to differing numbers of detected genes. However, the major drawback is that the process is not deterministic so each time the downsampling is run the resulting expression matrix is slightly different. Thus, often analyses must be run on multiple downsamplings to ensure results are robust. Example of a __downsampling__ function in `R`: -```{r Down_Sample_Matrix-source, eval=FALSE} +```{r exprs-norm4, eval=FALSE} ``` @@ -88,7 +88,7 @@ A final way to correct for library size is to downsample the expression matrix s to compare the efficiency of different normalization methods we will use visual inspection of `PCA` plots and calculation of cell-wise _relative log expression_ via `scater`'s `plotRLE()` function. Namely, cells with many (few) reads have higher (lower) than median expression for most genes resulting in a positive (negative) _RLE_ across the cell, whereas normalized cells have an _RLE_ close to zero. Example of a _RLE_ function in `R`: -```{r calc_cell_RLE-source, eval=FALSE} +```{r exprs-norm5, eval=FALSE} ``` @@ -104,7 +104,7 @@ __Note__ For __CPM__ normalisation we use `scater`'s `calculateCPM()` function. We will continue to work with the `tung` data that was used in the previous chapter. -```{r, message=FALSE, warning=FALSE} +```{r exprs-norm6, message=FALSE, warning=FALSE} library(scRNA.seq.funcs) library(scater) library(scran) @@ -116,7 +116,7 @@ endog_genes <- !rowData(umi.qc)$is_feature_control ``` ### Raw -```{r norm-pca-raw, fig.cap = "PCA plot of the tung data"} +```{r exprs-norm7, fig.cap = "PCA plot of the tung data"} tmp <- runPCA( umi.qc[endog_genes, ], exprs_values = "logcounts_raw" @@ -130,7 +130,7 @@ plotPCA( ``` ### CPM -```{r norm-pca-cpm, fig.cap = "PCA plot of the tung data after CPM normalisation"} +```{r exprs-norm8, fig.cap = "PCA plot of the tung data after CPM normalisation"} logcounts(umi.qc) <- log2(calculateCPM(umi.qc, use_size_factors = FALSE) + 1) plotPCA( umi.qc[endog_genes, ], @@ -139,7 +139,7 @@ plotPCA( shape_by = "individual" ) ``` -```{r norm-ours-rle-cpm, fig.cap = "Cell-wise RLE of the tung data"} +```{r exprs-norm9, fig.cap = "Cell-wise RLE of the tung data"} plotRLE( umi.qc[endog_genes, ], exprs_values = "logcounts_raw", @@ -153,7 +153,7 @@ plotRLE( ``` ### scran -```{r norm-pca-lsf, fig.cap = "PCA plot of the tung data after LSF normalisation"} +```{r exprs-norm10, fig.cap = "PCA plot of the tung data after LSF normalisation"} qclust <- quickCluster(umi.qc, min.size = 30) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) umi.qc <- normalize(umi.qc) @@ -164,7 +164,7 @@ plotPCA( shape_by = "individual" ) ``` -```{r norm-ours-rle-scran, fig.cap = "Cell-wise RLE of the tung data"} +```{r exprs-norm11, fig.cap = "Cell-wise RLE of the tung data"} plotRLE( umi.qc[endog_genes, ], exprs_values = "logcounts_raw", @@ -178,14 +178,14 @@ plotRLE( ``` scran sometimes calculates negative or zero size factors. These will completely distort the normalized expression matrix. 
We can check the size factors scran has computed like so:
-```{r}
+```{r exprs-norm12}
 summary(sizeFactors(umi.qc))
 ```
 
For this dataset all the size factors are reasonable so we are done. If you find scran has calculated negative size factors, try increasing the cluster and pool sizes until they are all positive.
 
 ### Downsampling
-```{r norm-pca-downsample, fig.cap = "PCA plot of the tung data after downsampling"}
+```{r exprs-norm13, fig.cap = "PCA plot of the tung data after downsampling"}
 logcounts(umi.qc) <- log2(Down_Sample_Matrix(counts(umi.qc)) + 1)
 plotPCA(
     umi.qc[endog_genes, ],
@@ -194,7 +194,7 @@ plotPCA(
 
-```{r norm-ours-rle-downsample, fig.cap = "Cell-wise RLE of the tung data"}
+```{r exprs-norm14, fig.cap = "Cell-wise RLE of the tung data"}
 plotRLE(
     umi.qc[endog_genes, ],
     exprs_values = "logcounts_raw",
@@ -224,21 +224,21 @@ gene/transcript is sequenced, not the entire length. If in doubt check for a rel
 
However, here we show how these normalisations can be calculated using `scater`. First, we need to find the effective transcript length in Kilobases. However, our dataset contains only gene IDs, therefore we will be using the gene lengths instead of transcripts. `scater` uses the [biomaRt](https://bioconductor.org/packages/release/bioc/html/biomaRt.html) package, which allows one to annotate genes by other attributes:
-```{r, message = FALSE, warning = FALSE}
-umi.qc <- getBMFeatureAnnos(
-    umi.qc,
-    filters = "ensembl_gene_id",
-    attributes = c(
-        "ensembl_gene_id",
-        "hgnc_symbol",
-        "chromosome_name",
-        "start_position",
-        "end_position"
-    ),
-    biomart = "ENSEMBL_MART_ENSEMBL",
-    dataset = "hsapiens_gene_ensembl",
-    host = "www.ensembl.org"
-)
+```{r exprs-norm15, message = FALSE, warning = FALSE}
+#umi.qc <- getBMFeatureAnnos(
+#    umi.qc,
+#    filters = "ensembl_gene_id",
+#    attributes = c(
+#        "ensembl_gene_id",
+#        "hgnc_symbol",
+#        "chromosome_name",
+#        "start_position",
+#        "end_position"
+#    ),
+#    biomart = "ENSEMBL_MART_ENSEMBL",
+#    dataset = "hsapiens_gene_ensembl",
+#    host = "www.ensembl.org"
+#)
 
 # If you have mouse data, change the arguments based on this example:
 # getBMFeatureAnnos(
@@ -261,18 +261,18 @@ umi.qc <- getBMFeatureAnnos(
 
Some of the genes were not annotated, therefore we filter them out:
-```{r}
+```{r exprs-norm16}
 umi.qc.ann <- umi.qc[!is.na(rowData(umi.qc)$ensembl_gene_id), ]
 ```
 
Now we compute the total gene length in Kilobases by using the `end_position` and `start_position` fields:
-```{r}
+```{r exprs-norm17}
 eff_length <- abs(rowData(umi.qc.ann)$end_position - rowData(umi.qc.ann)$start_position) / 1000
 ```
 
-```{r length-vs-mean, fig.cap = "Gene length vs Mean Expression for the raw data"}
-plot(eff_length, rowMeans(counts(umi.qc.ann)))
+```{r exprs-norm18, fig.cap = "Gene length vs Mean Expression for the raw data"}
+#plot(eff_length, rowMeans(counts(umi.qc.ann)))
 ```
 
There is no relationship between gene length and mean expression so __FPKM__s & __TPM__s are inappropriate for this dataset. But we will demonstrate them anyway.
 
@@ -280,39 +280,39 @@ But we will demonstrate them anyway.
 
__Note__ Here we calculate the total gene length instead of the total exon length. Many genes will contain lots of introns, so their `eff_length` will be very different from what we have calculated. Please consider our calculation an approximation. If you want to use the total exon lengths, please refer to [this page](https://www.biostars.org/p/83901/). 
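For reference, writing $c_{g,i}$ for the count of gene $g$ in cell $i$ and $l_g$ for the effective gene length in kilobases computed above, the two quantities calculated below follow the standard definitions (stated here for clarity):

$$
\mathrm{FPKM}_{g,i} = \frac{c_{g,i}}{l_g \cdot \left(\sum_{k} c_{k,i}\right)/10^6}, \qquad
\mathrm{TPM}_{g,i} = \frac{c_{g,i}/l_g}{\sum_{k} c_{k,i}/l_k} \times 10^6
$$

That is, __FPKM__ scales counts by library size first and then by gene length, whereas __TPM__ normalizes each gene by its length before rescaling to a fixed total of $10^6$.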
Now we are ready to perform the normalisations: -```{r} +```{r exprs-norm19} tpm(umi.qc.ann) <- log2(calculateTPM(umi.qc.ann, eff_length) + 1) ``` Plot the results as a PCA plot: -```{r norm-pca-fpkm, fig.cap = "PCA plot of the tung data after TPM normalisation"} -tmp <- runPCA( - umi.qc.ann, - exprs_values = "tpm", -) -plotPCA( - tmp, - colour_by = "batch", - size_by = "total_features_by_counts", - shape_by = "individual" -) +```{r exprs-norm20, fig.cap = "PCA plot of the tung data after TPM normalisation"} +#tmp <- runPCA( +# umi.qc.ann, +# exprs_values = "tpm", +#) +#plotPCA( +# tmp, +# colour_by = "batch", +# size_by = "total_features_by_counts", +# shape_by = "individual" +#) ``` -```{r} +```{r exprs-norm21} tpm(umi.qc.ann) <- log2(calculateFPKM(umi.qc.ann, eff_length) + 1) ``` -```{r norm-pca-tpm, fig.cap = "PCA plot of the tung data after FPKM normalisation"} -tmp <- runPCA( - umi.qc.ann, - exprs_values = "tpm", -) -plotPCA( - tmp, - colour_by = "batch", - size_by = "total_features_by_counts", - shape_by = "individual" -) +```{r exprs-norm22, fig.cap = "PCA plot of the tung data after FPKM normalisation"} +#tmp <- runPCA( +# umi.qc.ann, +# exprs_values = "tpm", +#) +#plotPCA( +# tmp, +# colour_by = "batch", +# size_by = "total_features_by_counts", +# shape_by = "individual" +#) ``` __Note__ The `PCA` looks for differences between cells. Gene length is the same across cells for each gene thus __FPKM__ is almost identical to the __CPM__ plot (it is just rotated) since it performs __CPM__ first then normalizes gene length. Whereas, __TPM__ is different because it weights genes by their length before performing __CPM__. @@ -323,6 +323,6 @@ Perform the same analysis with read counts of the `tung` data. Use `tung/reads.r ### sessionInfo() -```{r echo=FALSE} +```{r exprs-norm23, echo=FALSE} sessionInfo() ``` diff --git a/course_files/exprs-overview-reads.Rmd b/course_files/exprs-overview-reads.Rmd index 27ccce0c5..a88492e63 100644 --- a/course_files/exprs-overview-reads.Rmd +++ b/course_files/exprs-overview-reads.Rmd @@ -4,7 +4,7 @@ output: html_document ## Data visualization (Reads) -```{r, message=FALSE, warning=FALSE} +```{r exprs-over-read0, message=FALSE, warning=FALSE} library(scater) options(stringsAsFactors = FALSE) reads <- readRDS("data/tung/reads.rds") @@ -12,12 +12,12 @@ reads.qc <- reads[rowData(reads)$use, colData(reads)$use] endog_genes <- !rowData(reads.qc)$is_feature_control ``` -```{r, echo=FALSE} +```{r exprs-over-read1, echo=FALSE} library(knitr) -opts_chunk$set(out.width='90%', fig.align = 'center') +opts_chunk$set(cache = TRUE, out.width='90%', fig.align = 'center') ``` -```{r expr-overview-pca-before-qc-reads1, fig.cap = "PCA plot of the tung data"} +```{r exprs-over-read2, fig.cap = "PCA plot of the tung data"} tmp <- runPCA( reads[endog_genes, ], exprs_values = "counts" @@ -30,7 +30,7 @@ plotPCA( ) ``` -```{r expr-overview-pca-before-qc-reads2, fig.cap = "PCA plot of the tung data"} +```{r exprs-over-read3, fig.cap = "PCA plot of the tung data"} tmp <- runPCA( reads[endog_genes, ], exprs_values = "logcounts_raw" @@ -43,7 +43,7 @@ plotPCA( ) ``` -```{r expr-overview-pca-after-qc-reads, fig.cap = "PCA plot of the tung data"} +```{r exprs-over-read4, fig.cap = "PCA plot of the tung data"} tmp <- runPCA( reads.qc[endog_genes, ], exprs_values = "logcounts_raw" @@ -56,7 +56,7 @@ plotPCA( ) ``` -```{r expr-overview-tsne-before-qc-reads, fig.cap = "tSNE map of the tung data"} +```{r exprs-over-read5, fig.cap = "tSNE map of the tung data"} set.seed(123456) tmp <- runTSNE( 
reads[endog_genes, ], @@ -71,7 +71,7 @@ plotTSNE( ) ``` -```{r expr-overview-tsne-after-qc-reads, fig.cap = "tSNE map of the tung data"} +```{r exprs-over-read6, fig.cap = "tSNE map of the tung data"} set.seed(123456) tmp <- runTSNE( reads.qc[endog_genes, ], @@ -86,7 +86,7 @@ plotTSNE( ) ``` -```{r expr-overview-tsne-after-qc-exercise2-1, fig.cap = "tSNE map of the tung data (perplexity = 10)", echo=FALSE} +```{r exprs-over-read7, fig.cap = "tSNE map of the tung data (perplexity = 10)", echo=FALSE} set.seed(123456) tmp <- runTSNE( reads.qc[endog_genes, ], @@ -101,7 +101,7 @@ plotTSNE( ) ``` -```{r expr-overview-tsne-after-qc-exercise2-2, fig.cap = "tSNE map of the tung data (perplexity = 200)", echo=FALSE} +```{r exprs-over-read8, fig.cap = "tSNE map of the tung data (perplexity = 200)", echo=FALSE} set.seed(123456) tmp <- runTSNE( reads.qc[endog_genes, ], @@ -116,6 +116,6 @@ plotTSNE( ) ``` -```{r} +```{r exprs-over-read9} sessionInfo() ``` diff --git a/course_files/exprs-overview.Rmd b/course_files/exprs-overview.Rmd index ac8e2df9f..cc8ffd29f 100644 --- a/course_files/exprs-overview.Rmd +++ b/course_files/exprs-overview.Rmd @@ -10,12 +10,12 @@ In this chapter we will continue to work with the filtered `Tung` dataset produc One important aspect of single-cell RNA-seq is to control for batch effects. Batch effects are technical artefacts that are added to the samples during handling. For example, if two sets of samples were prepared in different labs or even on different days in the same lab, then we may observe greater similarities between the samples that were handled together. In the worst case scenario, batch effects may be [mistaken](http://f1000research.com/articles/4-121/v1) for true biological variation. The `Tung` data allows us to explore these issues in a controlled manner since some of the salient aspects of how the samples were handled have been recorded. Ideally, we expect to see batches from the same individual grouping together and distinct groups corresponding to each individual. -```{r, echo=FALSE} +```{r exprs-over0, echo=FALSE} library(knitr) -opts_chunk$set(out.width='90%', fig.align = 'center') +opts_chunk$set(cache = TRUE, out.width='90%', fig.align = 'center') ``` -```{r, message=FALSE, warning=FALSE} +```{r exprs-over1, message=FALSE, warning=FALSE} library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) @@ -32,14 +32,14 @@ The easiest way to overview the data is by transforming it using the principal c Mathematically, the PCs correspond to the eigenvectors of the covariance matrix. The eigenvectors are sorted by eigenvalue so that the first principal component accounts for as much of the variability in the data as possible, and each succeeding component in turn has the highest variance possible under the constraint that it is orthogonal to the preceding components (the figure below is taken from [here](http://www.nlpca.org/pca_principal_component_analysis.html)). 
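If you want to convince yourself of this equivalence, you can compare `prcomp` against an explicit eigendecomposition of the covariance matrix on a small random matrix (a toy illustration only, not part of the analysis):
```{r exprs-over-pca-check, eval=FALSE}
# Toy check: PCs are the eigenvectors of the covariance matrix
set.seed(42)
X <- matrix(rnorm(200), nrow = 20)          # 20 "cells" x 10 "genes"
pca <- prcomp(X, center = TRUE, scale. = FALSE)
eig <- eigen(cov(X))
# the eigenvectors match the PCA rotation up to an arbitrary sign flip
all.equal(abs(unname(pca$rotation)), abs(eig$vectors), tolerance = 1e-6)
# the variance explained by each PC equals the corresponding eigenvalue
all.equal(unname(pca$sdev^2), eig$values, tolerance = 1e-6)
```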
-```{r clust-pca, echo=FALSE, fig.cap="Schematic representation of PCA dimensionality reduction", out.width='100%'}
+```{r exprs-over2, echo=FALSE, fig.cap="Schematic representation of PCA dimensionality reduction", out.width='100%'}
 knitr::include_graphics("figures/pca.png")
 ```
 
 #### Before QC
 
 Without log-transformation:
-```{r expr-overview-pca-before-qc1, fig.cap = "PCA plot of the tung data"}
+```{r exprs-over3, fig.cap = "PCA plot of the tung data"}
 tmp <- runPCA(
     umi[endog_genes, ],
     exprs_values = "counts"
@@ -53,7 +53,7 @@ plotPCA(
 ```
 
 With log-transformation:
-```{r expr-overview-pca-before-qc2, fig.cap = "PCA plot of the tung data"}
+```{r exprs-over4, fig.cap = "PCA plot of the tung data"}
 tmp <- runPCA(
     umi[endog_genes, ],
     exprs_values = "logcounts_raw"
@@ -72,7 +72,7 @@ plotPCA(
 
__However, note that just a log-transformation is not enough to account for diff
 
 #### After QC
 
-```{r expr-overview-pca-after-qc, fig.cap = "PCA plot of the tung data"}
+```{r exprs-over5, fig.cap = "PCA plot of the tung data"}
 tmp <- runPCA(
     umi.qc[endog_genes, ],
     exprs_values = "logcounts_raw"
@@ -96,7 +96,7 @@ plotPCA(
 
__Hint__ Use `ntop` argument of the `plotPCA` function.
 
 __Our answer__
 
-```{r expr-overview-pca-after-qc-exercise1-1, fig.cap = "PCA plot of the tung data (14214 genes)", echo=FALSE}
+```{r exprs-over6, fig.cap = "PCA plot of the tung data (14214 genes)", echo=FALSE}
 tmp <- runPCA(
     umi.qc[endog_genes, ],
     exprs_values = "logcounts_raw",
@@ -110,7 +110,7 @@ plotPCA(
 )
 ```
 
-```{r expr-overview-pca-after-qc-exercise1-2, fig.cap = "PCA plot of the tung data (50 genes)", echo=FALSE}
+```{r exprs-over7, fig.cap = "PCA plot of the tung data (50 genes)", echo=FALSE}
 tmp <- runPCA(
     umi.qc[endog_genes, ],
     exprs_values = "logcounts_raw",
@@ -133,7 +133,7 @@ An alternative to PCA for visualizing scRNASeq data is a tSNE plot. [tSNE](https
 
 #### Before QC
 
-```{r expr-overview-tsne-before-qc, fig.cap = "tSNE map of the tung data"}
+```{r exprs-over8, fig.cap = "tSNE map of the tung data"}
 set.seed(123456)
 tmp <- runTSNE(
     umi[endog_genes, ],
@@ -150,7 +150,7 @@ plotTSNE(
 
 #### After QC
 
-```{r expr-overview-tsne-after-qc, fig.cap = "tSNE map of the tung data"}
+```{r exprs-over9, fig.cap = "tSNE map of the tung data"}
 set.seed(123456)
 tmp <- runTSNE(
     umi.qc[endog_genes, ],
@@ -176,7 +176,7 @@ How do the tSNE plots change when a perplexity of 10 or 200 is used? How does th
 
 __Our answer__
 
-```{r expr-overview-tsne-after-qc-exercise2-1, fig.cap = "tSNE map of the tung data (perplexity = 10)", echo=FALSE}
+```{r exprs-over10, fig.cap = "tSNE map of the tung data (perplexity = 10)", echo=FALSE}
 set.seed(123456)
 tmp <- runTSNE(
     umi.qc[endog_genes, ],
@@ -191,7 +191,7 @@ plotTSNE(
 )
 ```
 
-```{r expr-overview-tsne-after-qc-exercise2-2, fig.cap = "tSNE map of the tung data (perplexity = 200)", echo=FALSE}
+```{r exprs-over11, fig.cap = "tSNE map of the tung data (perplexity = 200)", echo=FALSE}
 set.seed(123456)
 tmp <- runTSNE(
     umi.qc[endog_genes, ],
@@ -212,6 +212,6 @@ Perform the same analysis with read counts of the Blischak data. 
Use `tung/reads ### sessionInfo() -```{r echo=FALSE} +```{r exprs-over12, echo=FALSE} sessionInfo() ``` diff --git a/course_files/exprs-qc-reads.Rmd b/course_files/exprs-qc-reads.Rmd index 774657249..6bb463505 100644 --- a/course_files/exprs-qc-reads.Rmd +++ b/course_files/exprs-qc-reads.Rmd @@ -4,40 +4,40 @@ output: html_document ## Expression QC (Reads) -```{r, echo=FALSE} +```{r exprs-qc-reads0, echo=FALSE} library(knitr) -opts_chunk$set(out.width='90%', fig.align = 'center') +opts_chunk$set(cache = TRUE, out.width='90%', fig.align = 'center') ``` -```{r, message=FALSE, warning=FALSE} +```{r exprs-qc-reads1, message=FALSE, warning=FALSE} library(SingleCellExperiment) library(scater) options(stringsAsFactors = FALSE) ``` -```{r} +```{r exprs-qc-reads2} reads <- read.table("data/tung/reads.txt", sep = "\t") anno <- read.table("data/tung/annotation.txt", sep = "\t", header = TRUE) ``` -```{r} +```{r exprs-qc-reads3} head(reads[ , 1:3]) head(anno) ``` -```{r} +```{r exprs-qc-reads4} reads <- SingleCellExperiment( assays = list(counts = as.matrix(reads)), colData = anno ) ``` -```{r} +```{r exprs-qc-reads5} keep_feature <- rowSums(counts(reads) > 0) > 0 reads <- reads[keep_feature, ] ``` -```{r} +```{r exprs-qc-reads6} isSpike(reads, "ERCC") <- grepl("^ERCC-", rownames(reads)) isSpike(reads, "MT") <- rownames(reads) %in% c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", @@ -47,7 +47,7 @@ isSpike(reads, "MT") <- rownames(reads) %in% "ENSG00000198840") ``` -```{r} +```{r exprs-qc-reads7} reads <- calculateQCMetrics( reads, feature_controls = list( @@ -57,7 +57,7 @@ reads <- calculateQCMetrics( ) ``` -```{r total-counts-hist-reads, fig.cap = "Histogram of library sizes for all cells"} +```{r exprs-qc-reads8, fig.cap = "Histogram of library sizes for all cells"} hist( reads$total_counts, breaks = 100 @@ -65,15 +65,15 @@ hist( abline(v = 1.3e6, col = "red") ``` -```{r} +```{r exprs-qc-reads9} filter_by_total_counts <- (reads$total_counts > 1.3e6) ``` -```{r} +```{r exprs-qc-reads10} table(filter_by_total_counts) ``` -```{r total-features-hist-reads, fig.cap = "Histogram of the number of detected genes in all cells"} +```{r exprs-qc-reads11, fig.cap = "Histogram of the number of detected genes in all cells"} hist( reads$total_features_by_counts, breaks = 100 @@ -81,15 +81,15 @@ hist( abline(v = 7000, col = "red") ``` -```{r} +```{r exprs-qc-reads12} filter_by_expr_features <- (reads$total_features_by_counts > 7000) ``` -```{r} +```{r exprs-qc-reads13} table(filter_by_expr_features) ``` -```{r mt-vs-counts-reads, fig.cap = "Percentage of counts in MT genes"} +```{r exprs-qc-reads14, fig.cap = "Percentage of counts in MT genes"} plotColData( reads, x = "total_features_by_counts", @@ -98,7 +98,7 @@ plotColData( ) ``` -```{r ercc-vs-counts-reads, fig.cap = "Percentage of counts in ERCCs"} +```{r exprs-qc-reads15, fig.cap = "Percentage of counts in ERCCs"} plotColData( reads, x = "total_features_by_counts", @@ -107,7 +107,7 @@ plotColData( ) ``` -```{r} +```{r exprs-qc-reads16} filter_by_ERCC <- reads$batch != "NA19098.r2" & reads$pct_counts_ERCC < 25 table(filter_by_ERCC) @@ -115,7 +115,7 @@ filter_by_MT <- reads$pct_counts_MT < 30 table(filter_by_MT) ``` -```{r} +```{r exprs-qc-reads17} reads$use <- ( # sufficient features (genes) filter_by_expr_features & @@ -128,11 +128,11 @@ reads$use <- ( ) ``` -```{r} +```{r exprs-qc-reads18} table(reads$use) ``` -```{r auto-cell-filt-reads, fig.align='center', fig.cap="PCA plot used for automatic detection of cell outliers", message=FALSE, 
warning=FALSE, out.width='90%'}
+```{r exprs-qc-reads19, fig.align='center', fig.cap="PCA plot used for automatic detection of cell outliers", message=FALSE, warning=FALSE, out.width='90%'}
 reads <- runPCA(
     reads,
     use_coldata = TRUE,
@@ -141,11 +141,11 @@ reducedDimNames(reads)
 ```
 
-```{r}
+```{r exprs-qc-reads20}
 table(reads$outlier)
 ```
 
-```{r}
+```{r exprs-qc-reads21}
 plotReducedDim(
     reads,
     use_dimred = "PCA_coldata",
@@ -155,7 +155,7 @@ plotReducedDim(
 )
 ```
 
-```{r cell-filt-comp-reads, fig.cap = "Comparison of the default, automatic and manual cell filters"}
+```{r exprs-qc-reads22, fig.cap = "Comparison of the default, automatic and manual cell filters"}
 library(limma)
 auto <- colnames(reads)[reads$outlier]
 man <- colnames(reads)[!reads$use]
@@ -170,11 +170,11 @@ vennDiagram(
 )
 ```
 
-```{r top50-gene-expr-reads, fig.cap = "Number of total counts consumed by the top 50 expressed genes", fig.asp = 1}
+```{r exprs-qc-reads23, fig.cap = "Number of total counts consumed by the top 50 expressed genes", fig.asp = 1}
 plotHighestExprs(reads, exprs_values = "counts")
 ```
 
-```{r}
+```{r exprs-qc-reads24}
 keep_feature <- nexprs(
     reads[,colData(reads)$use],
     byrow = TRUE,
@@ -183,25 +183,25 @@ keep_feature <- nexprs(
 rowData(reads)$use <- keep_feature
 ```
 
-```{r}
+```{r exprs-qc-reads25}
 table(keep_feature)
 ```
 
-```{r}
+```{r exprs-qc-reads26}
 dim(reads[rowData(reads)$use, colData(reads)$use])
 ```
 
-```{r}
+```{r exprs-qc-reads27}
 assay(reads, "logcounts_raw") <- log2(counts(reads) + 1)
 reducedDim(reads) <- NULL
 ```
 
-```{r}
+```{r exprs-qc-reads28}
 saveRDS(reads, file = "data/tung/reads.rds")
 ```
 
By comparing Figure \@ref(fig:exprs-qc20) and Figure \@ref(fig:exprs-qc-reads22), it is clear that the reads-based filtering removed more cells than the UMI-based analysis. If you go back and compare the results you should be able to conclude that the ERCC and MT filters are stricter for the reads-based analysis.
 
-```{r}
+```{r exprs-qc-reads29}
 sessionInfo()
 ```
diff --git a/course_files/exprs-qc.Rmd b/course_files/exprs-qc.Rmd
index 8dfcc09e4..2bc5ed0f3 100644
--- a/course_files/exprs-qc.Rmd
+++ b/course_files/exprs-qc.Rmd
@@ -23,25 +23,25 @@ University of Chicago. The experiments were carried out on the
 
Fluidigm C1 platform and to facilitate the quantification both unique molecular identifiers (UMIs) and ERCC _spike-ins_ were used. The data files are located in the `tung` folder in your working directory. These files are the copies of the original files made on the 15/03/16. We will use these copies for reproducibility purposes.
-```{r, echo=FALSE}
+```{r exprs-qc0, echo=FALSE}
 library(knitr)
-opts_chunk$set(out.width='90%', fig.align = 'center')
+opts_chunk$set(cache = TRUE, out.width='90%', fig.align = 'center')
 ```
 
-```{r, message=FALSE, warning=FALSE}
+```{r exprs-qc1, message=FALSE, warning=FALSE}
 library(SingleCellExperiment)
 library(scater)
 options(stringsAsFactors = FALSE)
 ```
 
 Load the data and annotations:
-```{r}
+```{r exprs-qc2}
 molecules <- read.table("data/tung/molecules.txt", sep = "\t")
 anno <- read.table("data/tung/annotation.txt", sep = "\t", header = TRUE)
 ```
 
 Inspect a small portion of the expression matrix:
-```{r}
+```{r exprs-qc3}
 head(molecules[ , 1:3])
 head(anno)
 ```
@@ -49,7 +49,7 @@ head(anno)
 
The data consists of `r length(unique(anno$individual))` individuals and `r length(unique(anno$replicate))` replicates and therefore has `r length(unique(anno$batch))` batches in total. 
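To see this breakdown explicitly, you can cross-tabulate the annotation columns (a quick illustrative check):
```{r exprs-qc-batch-check, eval=FALSE}
# every individual x replicate combination defines one batch
table(anno$individual, anno$replicate)
```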
We standardize the analysis by using both `SingleCellExperiment` (SCE) and `scater` packages. First, create the SCE object: -```{r} +```{r exprs-qc4} umi <- SingleCellExperiment( assays = list(counts = as.matrix(molecules)), colData = anno @@ -57,13 +57,13 @@ umi <- SingleCellExperiment( ``` Remove genes that are not expressed in any cell: -```{r} +```{r exprs-qc5} keep_feature <- rowSums(counts(umi) > 0) > 0 umi <- umi[keep_feature, ] ``` Define control features (genes) - ERCC spike-ins and mitochondrial genes ([provided](http://jdblischak.github.io/singleCellSeq/analysis/qc-filter-ipsc.html) by the authors): -```{r} +```{r exprs-qc6} isSpike(umi, "ERCC") <- grepl("^ERCC-", rownames(umi)) isSpike(umi, "MT") <- rownames(umi) %in% c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", @@ -74,7 +74,7 @@ isSpike(umi, "MT") <- rownames(umi) %in% ``` Calculate the quality metrics: -```{r} +```{r exprs-qc7} umi <- calculateQCMetrics( umi, feature_controls = list( @@ -93,7 +93,7 @@ Next we consider the total number of RNA molecules detected per sample (if we were using read counts rather than UMI counts this would be the total number of reads). Wells with few reads/molecules are likely to have been broken or failed to capture a cell, and should thus be removed. -```{r total-counts-hist, fig.cap = "Histogram of library sizes for all cells"} +```{r exprs-qc8, fig.cap = "Histogram of library sizes for all cells"} hist( umi$total_counts, breaks = 100 @@ -110,7 +110,7 @@ total number of molecules for each cell should follow? __Our answer__ -```{r, echo=FALSE} +```{r exprs-qc9, echo=FALSE} filter_by_total_counts <- (umi$total_counts > 25000) table(filter_by_total_counts) ``` @@ -119,7 +119,7 @@ table(filter_by_total_counts) In addition to ensuring sufficient sequencing depth for each sample, we also want to make sure that the reads are distributed across the transcriptome. Thus, we count the total number of unique genes detected in each sample. -```{r total-features-hist, fig.cap = "Histogram of the number of detected genes in all cells"} +```{r exprs-qc10, fig.cap = "Histogram of the number of detected genes in all cells"} hist( umi$total_features_by_counts, breaks = 100 @@ -141,7 +141,7 @@ How many cells does our filter remove? __Our answer__ -```{r, echo=FALSE} +```{r exprs-qc11, echo=FALSE} filter_by_expr_features <- (umi$total_features_by_counts > 7000) table(filter_by_expr_features) ``` @@ -154,7 +154,7 @@ of RNA in the captured cells. Cells with a high level of _spike-in_ RNAs had low starting amounts of RNA, likely due to the cell being dead or stressed which may result in the RNA being degraded. 
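The control percentages plotted below are computed by `calculateQCMetrics()`; conceptually, they are simply the share of each cell's counts that comes from the control features. A manual sketch of the same quantity (for illustration only):
```{r exprs-qc-pct-sketch, eval=FALSE}
# percentage of each cell's counts derived from ERCC spike-ins
ercc_pct <- 100 * colSums(counts(umi)[isSpike(umi, "ERCC"), , drop = FALSE]) /
    colSums(counts(umi))
summary(ercc_pct)
```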
-```{r mt-vs-counts, fig.cap = "Percentage of counts in MT genes"} +```{r exprs-qc12, fig.cap = "Percentage of counts in MT genes"} plotColData( umi, x = "total_features_by_counts", @@ -163,7 +163,7 @@ plotColData( ) ``` -```{r ercc-vs-counts, fig.cap = "Percentage of counts in ERCCs"} +```{r exprs-qc13, fig.cap = "Percentage of counts in ERCCs"} plotColData( umi, x = "total_features_by_counts", @@ -180,7 +180,7 @@ Create filters for removing batch NA19098.r2 and cells with high expression of m __Our answer__ -```{r, echo=FALSE} +```{r exprs-qc14, echo=FALSE} filter_by_ERCC <- umi$batch != "NA19098.r2" table(filter_by_ERCC) filter_by_MT <- umi$pct_counts_MT < 10 @@ -201,7 +201,7 @@ You would expect to see a group corresponding to the smaller cells (normal) with Now we can define a cell filter based on our previous analysis: -```{r} +```{r exprs-qc15} umi$use <- ( # sufficient features (genes) filter_by_expr_features & @@ -214,7 +214,7 @@ umi$use <- ( ) ``` -```{r} +```{r exprs-qc16} table(umi$use) ``` @@ -233,7 +233,7 @@ By default, the following metrics are used for PCA-based outlier detection: `scater` first creates a matrix where the rows represent cells and the columns represent the different QC metrics. Then, outlier cells can also be identified by using the `mvoutlier` package on the QC metrics for all cells. This will identify cells that have substantially different QC metrics from the others, possibly corresponding to low-quality cells. We can visualize any outliers using a principal components plot as shown below: -```{r auto-cell-filt, fig.align='center', fig.cap="PCA plot used for automatic detection of cell outliers", message=FALSE, warning=FALSE, out.width='90%'} +```{r exprs-qc17, fig.align='center', fig.cap="PCA plot used for automatic detection of cell outliers", message=FALSE, warning=FALSE, out.width='90%'} umi <- runPCA( umi, use_coldata = TRUE, @@ -244,13 +244,13 @@ reducedDimNames(umi) Column subsetting can then be performed based on the `$outlier` slot, which indicates whether or not each cell has been designated as an outlier. Automatic outlier detection can be informative, but a close inspection of QC metrics and tailored filtering for the specifics of the dataset at hand is strongly recommended. -```{r} +```{r exprs-qc18} table(umi$outlier) ``` Then, we can use a PCA plot to see a 2D representation of the cells ordered by their quality metrics. -```{r} +```{r exprs-qc19} plotReducedDim( umi, use_dimred = "PCA_coldata", @@ -270,7 +270,7 @@ __Hint__: Use `vennCounts` and `vennDiagram` functions from the [limma](https:// __Answer__ -```{r cell-filt-comp, fig.cap = "Comparison of the default, automatic and manual cell filters", echo=FALSE} +```{r exprs-qc20, fig.cap = "Comparison of the default, automatic and manual cell filters", echo=FALSE} library(limma) auto <- colnames(umi)[umi$outlier] man <- colnames(umi)[!umi$use] @@ -293,7 +293,7 @@ In addition to removing cells with poor quality, it is usually a good idea to ex It is often instructive to consider the number of reads consumed by the top 50 expressed genes. -```{r top50-gene-expr, fig.cap = "Number of total counts consumed by the top 50 expressed genes", fig.asp = 1} +```{r exprs-qc21, fig.cap = "Number of total counts consumed by the top 50 expressed genes", fig.asp = 1} plotHighestExprs(umi, exprs_values = "counts") ``` @@ -304,7 +304,7 @@ The distributions are relatively flat indicating (but not guaranteeing!) 
good co It is typically a good idea to remove genes whose expression level is considered __"undetectable"__. We define a gene as detectable if at least two cells contain more than 1 transcript from the gene. If we were considering read counts rather than UMI counts a reasonable threshold is to require at least five reads in at least two cells. However, in both cases the threshold strongly depends on the sequencing depth. It is important to keep in mind that genes must be filtered after cell filtering since some genes may only be detected in poor quality cells (__note__ `colData(umi)$use` filter applied to the `umi` dataset). -```{r} +```{r exprs-qc22} keep_feature <- nexprs( umi[,colData(umi)$use], byrow = TRUE, @@ -313,7 +313,7 @@ keep_feature <- nexprs( rowData(umi)$use <- keep_feature ``` -```{r} +```{r exprs-qc23} table(keep_feature) ``` @@ -323,18 +323,18 @@ Depending on the cell-type, protocol and sequencing depth, other cut-offs may be ### Save the data Dimensions of the QCed dataset (do not forget about the gene filter we defined above): -```{r} +```{r exprs-qc24} dim(umi[rowData(umi)$use, colData(umi)$use]) ``` Let's create an additional slot with log-transformed counts (we will need it in the next chapters) and remove saved PCA results from the `reducedDim` slot: -```{r} +```{r exprs-qc25} assay(umi, "logcounts_raw") <- log2(counts(umi) + 1) reducedDim(umi) <- NULL ``` Save the data: -```{r} +```{r exprs-qc26} saveRDS(umi, file = "data/tung/umi.rds") ``` @@ -344,6 +344,6 @@ Perform exactly the same QC analysis with read counts of the same Blischak data. ### sessionInfo() -```{r echo=FALSE} +```{r exprs-qc27, echo=FALSE} sessionInfo() ``` diff --git a/course_files/ggplot2-pheatmap-PCA.Rmd b/course_files/ggplot2-pheatmap-PCA.Rmd index 960653837..e15942ccd 100644 --- a/course_files/ggplot2-pheatmap-PCA.Rmd +++ b/course_files/ggplot2-pheatmap-PCA.Rmd @@ -2,11 +2,10 @@ output: html_document --- -```{r ggplot2-pheatmap-PCA0, echo=FALSE} +```{r ggplot0, echo=FALSE} library(knitr) opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo())) ``` - ## An Introduction to ggplot2 ### What is ggplot2? @@ -24,7 +23,7 @@ ggplot2 is an R package designed by Hadley Wickham which facilitates data plotti The `aes` function specifies how variables in your dataframe map to features on your plot. To understand how this works, let's look at an example: -```{R ggplot2-pheatmap-PCA1, eval=TRUE, message=FALSE} +```{R ggplot1, eval=TRUE, message=FALSE} library(ggplot2) library(tidyverse) set.seed(1) @@ -48,7 +47,7 @@ We can use geoms to specify how we would like data to be displayed on our graphs Let's see how our graph would look as a scatterplot. -```{R ggplot2-pheatmap-PCA2, eval=TRUE} +```{R ggplot2, eval=TRUE} ggplot(data = counts, mapping = aes(x = cell1, y = cell2)) + geom_point() ``` @@ -62,14 +61,14 @@ So far we've been considering the gene counts from 2 of the cells in our datafra At the moment we can't do this because we are treating each individual cell as a variable and assigning that variable to either the x or the y axis. We could create a 10 dimensional graph to plot data from all 10 cells on, but this is a) not possible to do with ggplot and b) not very easy to interpret. What we could do instead is to tidy our data so that we had one variable representing cell ID and another variable representing gene counts, and plot those against each other. 
In code, this would look like:
-```{R ggplot2-pheatmap-PCA3, eval=TRUE}
+```{R ggplot3, eval=TRUE}
 counts<-gather(counts, colnames(counts)[2:11], key = 'Cell_ID', value='Counts')
 head(counts)
 ```
 
Essentially, the problem before was that our data was not tidy because one variable (Cell_ID) was spread over multiple columns. Now that we've fixed this problem, it is much easier for us to plot data from all 10 cells on one graph.
 
-```{R ggplot2-pheatmap-PCA4, eval=TRUE}
+```{R ggplot4, eval=TRUE}
 ggplot(counts,aes(x=Cell_ID, y=Counts)) + geom_boxplot()
 ```
 
@@ -81,7 +80,7 @@ Task 4: Use the updated `counts` dataframe to plot a scatterplot with Gene_ids a
 
 #### Plotting data with pheatmap
 
A common method for visualising gene expression data is with a heatmap. Here we will use the R package `pheatmap` to perform this analysis with some gene expression data we will name `test`.
 
-```{R ggplot2-pheatmap-PCA5, eval=TRUE, message=FALSE}
+```{R ggplot5, eval=TRUE, message=FALSE}
 library(pheatmap)
 set.seed(2)
 test = matrix(rnorm(200), 20, 10)
@@ -99,7 +98,7 @@ This plot also gives us information on the results of a clustering algorithm. In
 
If we look closely at the trees, we can see that eventually they have the same number of branches as there are cells and genes. In other words, the total number of cell clusters is the same as the total number of cells, and the total number of gene clusters is the same as the total number of genes. Clearly, this is not very informative, and will become impractical when we are looking at more than 10 cells and 20 genes. Fortunately, we can set the number of clusters we see on the plot. Let's try setting the number of gene clusters to 2:
 
-```{R ggplot2-pheatmap-PCA6, eval=TRUE}
+```{R ggplot6, eval=TRUE}
 pheatmap(test, kmeans_k = 2)
 ```
 
@@ -115,7 +114,7 @@ PCA plots are a good way to get an overview of your data, and can sometimes help
 
Let's make a PCA plot for our `test` data. We can use the `ggfortify` package to let ggplot know how to interpret principal components.
 
-```{R ggplot2-pheatmap-PCA7,eval=TRUE, message=FALSE}
+```{R ggplot7, eval=TRUE, message=FALSE}
 library(ggfortify)
 Principal_Components<-prcomp(test)
 autoplot(Principal_Components, label=TRUE)
@@ -125,7 +124,7 @@ Task 6: Compare your clusters to the pheatmap clusters. Are they related? (Hint:
 
Task 7: Produce a heatmap and PCA plot for `counts` (below):
 
-```{R ggplot2-pheatmap-PCA8,eval=TRUE}
+```{R ggplot8, eval=TRUE}
 set.seed(1)
 counts <- as.data.frame(matrix(rpois(100, lambda = 10), ncol=10, nrow=10))
 rownames(counts) <- paste("gene", 1:10, sep = "")
diff --git a/course_files/ideal-scrnaseq-pipeline.Rmd b/course_files/ideal-scrnaseq-pipeline.Rmd
index 03feeb832..44b4f3e6b 100644
--- a/course_files/ideal-scrnaseq-pipeline.Rmd
+++ b/course_files/ideal-scrnaseq-pipeline.Rmd
@@ -4,9 +4,9 @@ output: html_document
 
 # "Ideal" scRNAseq pipeline (as of Oct 2017)
 
-```{r, echo=FALSE}
+```{r ideal0, echo=FALSE}
 library(knitr)
-opts_chunk$set(fig.align = "center", echo=FALSE)
+opts_chunk$set(cache = TRUE, fig.align = "center", echo=FALSE)
 ```
 
 ## Experimental Design
 
@@ -35,7 +35,7 @@
* Transcription factor detection (regulatory networks) requires high read depth and the most sensitive protocols (i.e. Fluidigm C1)
* Cell clustering & cell-type identification benefits from a large number of cells and doesn't require as high a sequencing depth (~100,000 reads per cell).
 
-```{r pipeline-batches, out.width = '90%', fig.cap="Appropriate approaches to batch effects in scRNASeq. 
+```{r ideal1, out.width = '90%', fig.cap="Appropriate approaches to batch effects in scRNASeq. Red arrows indicate batch effects which are (pale) or are not (vibrant) correctable through batch-correction."}
 knitr::include_graphics("figures/Pipeline-batches.png")
 ```
 ## Processing Reads
diff --git a/course_files/imputation.Rmd b/course_files/imputation.Rmd
index 822f49aac..9eb58a290 100644
--- a/course_files/imputation.Rmd
+++ b/course_files/imputation.Rmd
@@ -3,11 +3,11 @@ output: html_document
 ---
 
 ## Imputation
-```{r, echo=FALSE}
+```{r imputation0, echo=FALSE}
 library(knitr)
-opts_chunk$set(fig.align="center")
+opts_chunk$set(cache = TRUE, fig.align="center")
 ```
-```{r, echo=TRUE, message=FALSE, warning=FALSE}
+```{r imputation1, echo=TRUE, message=FALSE, warning=FALSE}
 library(scImpute)
 library(SC3)
 library(scater)
@@ -52,7 +52,7 @@ imputed all inflated zeros.
 
To test `scImpute`, we use the default parameters and apply it to the Deng dataset that we have worked with before. scImpute takes a .csv or .txt file as an input:

-```{r message=FALSE, warning=FALSE, paged.print=FALSE}
+```{r imputation2, message=FALSE, warning=FALSE, paged.print=FALSE}
 deng <- readRDS("data/deng/deng-reads.rds")
 write.csv(counts(deng), "deng.csv")
 scimpute(
@@ -67,7 +67,7 @@ scimpute(
 
Now we can compare the results with the original data by considering a PCA plot.

-```{r}
+```{r imputation3}
 res <- read.table("scimpute_count.txt")
 colnames(res) <- NULL
 res <- SingleCellExperiment(
@@ -85,14 +85,14 @@ Compare this result to the original data in Chapter \@ref(clust-methods). What a
 
We can examine the expression of specific genes to directly see the effect of imputation on the expression distribution.

-```{r}
+```{r imputation4}
 plotExpression(res, c("Sox2", "Eomes", "Zscan4d", "Fgf4"))
 plotExpression(deng, c("Sox2", "Eomes", "Zscan4d", "Fgf4"))
 ```
 
To evaluate the impact of the imputation, we use `SC3` to cluster the imputed matrix

-```{r message=FALSE, warning=FALSE, paged.print=FALSE}
+```{r imputation5, message=FALSE, warning=FALSE, paged.print=FALSE}
 res <- sc3_estimate_k(res)
 metadata(res)$sc3$k_estimation
 res <- sc3(res, ks = 10, n_cores = 1, gene_filter = FALSE)
@@ -109,7 +109,7 @@ __Exercise:__ Based on the PCA and the clustering results, do you think that imp
 
We can do the same for DrImpute. DrImpute runs on a log-normalized expression matrix directly in R; we generate this matrix using scater, then run DrImpute. Unlike scImpute, DrImpute considers the consensus imputation across a range of ks using two different correlation distances:

-```{r}
+```{r imputation6}
 deng <- normalize(deng)
 res <- DrImpute(deng@assays[["logcounts"]], ks=8:12)
 colnames(res) <- colnames(deng)
@@ -141,11 +141,11 @@ within the dataset. Since it is based on a diffusion process, it specifically
 enhances trajectory-like structure in a dataset, in contrast to scImpute and
 DrImpute which assume a cluster-like structure to the underlying data.
 
-```{r, eval=FALSE}
+```{r imputation7, eval=FALSE}
 res <- magic(t(deng@assays[["logcounts"]]), genes="all_genes",
              k=10, t="auto")
 ```
 
-```{r, eval=FALSE}
+```{r imputation8, eval=FALSE}
 res <- t(as.matrix(res))
 rownames(res) <- rownames(deng)
 colnames(res) <- colnames(deng)
@@ -163,7 +163,7 @@ plotPCA(
 
Compare this result to the original data in Chapter \@ref(clust-methods). What are the most significant differences?
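
One rough way to start this comparison (an added sketch, not part of the original course solution) is to correlate each gene's original log-expression with its MAGIC-imputed values; genes with low correlation are the ones imputation has changed most. This assumes the `deng` and imputed `res` objects from the (`eval=FALSE`) chunks above are in memory and both carry a `logcounts` assay with matching rows and columns.

```{r imputation-magic-check, eval=FALSE}
# Sketch: per-gene Spearman correlation between original and MAGIC-imputed
# log-expression. `deng` and `res` are assumed to be SingleCellExperiment
# objects with matching "logcounts" assays (see the chunks above).
gene_cor <- sapply(seq_len(nrow(deng)), function(i) {
    cor(logcounts(deng)[i, ], logcounts(res)[i, ], method = "spearman")
})
# Genes with constant expression produce NA correlations; drop them.
summary(gene_cor[!is.na(gene_cor)])
hist(gene_cor, breaks = 50, xlab = "Spearman correlation per gene",
     main = "Original vs MAGIC-imputed")
```

A long left tail in this histogram would flag the genes whose expression profiles MAGIC has reshaped most strongly.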
To evaluate the impact of the imputation, we use `SC3` to cluster the imputed matrix

-```{r message=FALSE, warning=FALSE, paged.print=FALSE, eval=FALSE}
+```{r imputation9, message=FALSE, warning=FALSE, paged.print=FALSE, eval=FALSE}
 res <- sc3_estimate_k(res)
 metadata(res)$sc3$k_estimation
 res <- sc3(res, ks = 10, n_cores = 1, gene_filter = FALSE)
@@ -183,6 +183,6 @@ __Exercise:__ What is the difference between `scImpute` and `MAGIC` based on the
 
 ### sessionInfo()
 
-```{r echo=FALSE}
+```{r imputation10, echo=FALSE}
 sessionInfo()
 ```
diff --git a/course_files/intro-to-R.Rmd b/course_files/intro-to-R.Rmd
index a78e76ca4..957bf6658 100644
--- a/course_files/intro-to-R.Rmd
+++ b/course_files/intro-to-R.Rmd
@@ -2,9 +2,9 @@ output: html_document
 ---
 
-```{r intro-to-R0, echo=FALSE}
+```{r Intro-R0, echo=FALSE}
 library(knitr)
-opts_chunk$set(out.width='90%', fig.align = 'center', cache=TRUE, cache.extra = list(R.version, sessionInfo()))
+opts_chunk$set(cache= TRUE, out.width='90%', fig.align = 'center')
 ```
 
 # Introduction to R/Bioconductor
 
@@ -15,7 +15,7 @@ opts_chunk$set(out.width='90%', fig.align = 'center', cache=TRUE, cache.extra = 
 
The Comprehensive R Archive Network [CRAN](https://cran.r-project.org/) is the biggest archive of R packages. There are few requirements for uploading packages besides building and installing successfully, hence documentation and support is often minimal and figuring out how to use these packages can be a challenge in itself. CRAN is the default repository R will search to find packages to install:

-```{r intro-to-R1, eval=FALSE}
+```{r Intro-R1, eval=FALSE}
 install.packages("devtools")
 require("devtools")
 ```
 
@@ -24,13 +24,13 @@ require("devtools")
 
[Github](https://github.com/) isn't specific to R; any code of any type in any state can be uploaded. There is no guarantee a package uploaded to github will even install, never mind do what it claims to do. R packages can be downloaded and installed directly from github using the "devtools" package installed above.

-```{r intro-to-R2, eval=FALSE}
+```{r Intro-R2, eval=FALSE}
 devtools::install_github("tallulandrews/M3Drop")
 ```
 
Github is also a version control system which stores multiple versions of any package. By default the most recent "master" version of the package is installed. If you want an older version or the development branch this can be specified using the "ref" parameter:

-```{r intro-to-R3, eval=FALSE}
+```{r Intro-R3, eval=FALSE}
 # different branch
 devtools::install_github("tallulandrews/M3D", ref="nbumi")
 # previous commit
@@ -42,7 +42,7 @@ Note: make sure you re-install the M3Drop master branch for later in the course.
 
Bioconductor is a repository of R-packages specifically for biological analyses. It has the strictest requirements for submission, including installation on every platform and full documentation with a tutorial (called a vignette) explaining how the package should be used. Bioconductor also encourages utilization of standard data structures/classes and coding style/naming conventions, so that, in theory, packages and analyses can be combined into large pipelines or workflows.

-```{r intro-to-R4, eval=FALSE}
+```{r Intro-R4, eval=FALSE}
 source("https://bioconductor.org/biocLite.R")
 biocLite("edgeR")
 ```
 
Note: in some situations it is necessary to substitute "http://" for "https://" in the above depending on the security features of your internet connection/network.

Bioconductor also requires creators to support their packages and has a regular 6-month release schedule.
Make sure you are using the most recent release of bioconductor before trying to install packages for the course.

-```{r intro-to-R5, eval=FALSE}
+```{r Intro-R5, eval=FALSE}
 source("https://bioconductor.org/biocLite.R")
 biocLite("BiocUpgrade")
 ```
 
@@ -59,7 +59,7 @@ biocLite("BiocUpgrade")
 ### Source
 The final way to install packages is directly from source. In this case you have to download a fully built source code file, usually packagename.tar.gz, or clone the github repository and rebuild the package yourself. Generally this will only be done if you want to edit a package yourself, or if for some reason the former methods have failed.
 
-```{r intro-to-R6, eval=FALSE}
+```{r Intro-R6, eval=FALSE}
 install.packages("M3Drop_3.05.00.tar.gz", type="source")
 ```
 
@@ -76,7 +76,7 @@ Aside: R can also store data as "complex" for complex numbers but generally this
 
 The "numeric" class is the default class for storing any numeric data - integers, decimal numbers, numbers in scientific notation, etc...
 
-```{r intro-to-R7}
+```{r Intro-R7}
 x = 1.141
 class(x)
 y = 42
@@ -87,7 +87,7 @@ class(z)
 
Here we see that even though R has an "integer" class and 42 could be stored more efficiently as an integer the default is to store it as "numeric". If we want 42 to be stored as an integer we must "coerce" it to that class:

-```{r intro-to-R8}
+```{r Intro-R8}
 y = as.integer(42)
 class(y)
 ```
 
Coercion will force R to store data as a particular class; if our data is incompatible with that class R will still coerce it, but the incompatible values will be converted to NAs:

-```{r intro-to-R9}
+```{r Intro-R9}
 as.numeric("H")
 ```
 
@@ -104,7 +104,7 @@ Above we tried to coerce "character" data, identified by the double quotation ma
 ### Character/String
 
 The "character" class stores all kinds of text data. Programming convention calls data containing multiple letters a "string", thus most R functions which act on character data will refer to the data as "strings" and will often have "str" or "string" in its name. Strings are identified by being flanked by double quotation marks, whereas variable/function names are not:
-```{r intro-to-R10}
+```{r Intro-R10}
 x = 5
 a = "x" # character "x"
 
 a
 b = x # variable x
 b
 ```
 
In addition to standard alphanumeric characters, strings can also store various special characters. Special characters are identified using a backslash followed by a single character, the most relevant are the special character for tab : `\t` and new line : `\n`. To demonstrate these special characters let's concatenate (cat) together two strings with these characters separating (sep) them:

-```{r intro-to-R11}
+```{r Intro-R11}
 cat("Hello", "World", sep= " ")
 
 cat("Hello", "World", sep= "\t")
 
 cat("Hello", "World", sep= "\n")
 ```
 
Note that special characters work differently in different functions. For instance the `paste` function does the same thing as `cat` but does not recognize special characters.

-```{r intro-to-R12}
+```{r Intro-R12}
 paste("Hello", "World", sep= " ")
 
 paste("Hello", "World", sep= "\t")
 
 paste("Hello", "World", sep= "\n")
 ```
 
Single or double backslash is also used as an `escape` character to turn off special characters or allow quotation marks to be included in strings:

-```{r intro-to-R13}
+```{r Intro-R13}
 cat("This \"string\" contains quotation marks.")
 ```
 
Special characters are generally only used in pattern matching, and reading/writing data to files. For instance this is how you would read a tab-separated file into R.

-```{r intro-to-R14, eval=FALSE}
+```{r Intro-R14, eval=FALSE}
 dat = read.delim("file.tsv", sep="\t")
 ```
 
Another special type of character data are colours. Colours can be specified in three main ways: by name from those [available](http://bxhorn.com/r-color-tables/), by red, green, blue values using the `rgb` function, and by hue (colour), saturation (colour vs white) and value (colour/white vs black) using the `hsv` function. By default rgb and hsv expect three values in 0-1 with an optional fourth value for transparency. Alternatively, sets of predetermined colours with useful properties can be loaded from many different packages with [RColorBrewer](http://colorbrewer2.org/) being one of the most popular.

-```{r intro-to-R15}
+```{r Intro-R15}
 reds = c("red", rgb(1,0,0), hsv(0, 1, 1))
 reds
 barplot(c(1,1,1), col=reds, names=c("by_name", "by_rgb", "by_hsv"))
 ```
 
 ### Logical
 
 The `logical` class stores boolean truth values, i.e. TRUE and FALSE. It is used for storing the results of logical operations, and conditional statements will be coerced to this class. Most other data-types can be coerced to boolean without triggering (or "throwing") error messages, which may cause unexpected behaviour.
 
-```{r intro-to-R16}
+```{r Intro-R16}
 x = TRUE
 class(x)
 
@@ -182,7 +182,7 @@ Do you ever throw a warning/error message?
 
String/Character data is very memory inefficient to store, each letter generally requires the same amount of memory as any integer. Thus when storing a vector of strings with repeated elements it is more efficient to assign each element to an integer and store the vector as integers and an additional string-to-integer association table. Thus, by default R will read in text columns of a data table as factors.

-```{r intro-to-R17}
+```{r Intro-R17}
 str_vector = c("Apple", "Apple", "Banana", "Banana", "Banana", "Carrot", "Carrot", "Apple", "Banana")
 factored_vector = factor(str_vector)
 factored_vector
@@ -191,12 +191,12 @@ as.numeric(factored_vector)
 
 The double nature of factors can cause some unintuitive behaviour. E.g. joining two factors together will convert them to the numeric form and the original strings will be lost.
 
-```{r intro-to-R18}
+```{r Intro-R18}
 c(factored_vector, factored_vector)
 ```
 Likewise if due to formatting issues numeric data is mistakenly interpreted as strings, then you must convert the factor back to strings before coercing to numeric values:
 
-```{r intro-to-R19}
+```{r Intro-R19}
 x = c("20", "25", "23", "38", "20", "40", "25", "30")
 x = factor(x)
 as.numeric(x)
@@ -205,13 +205,13 @@ as.numeric(as.character(x))
 
 To make R read text as character data instead of factors set the environment option `stringsAsFactors=FALSE`. This must be done at the start of each R session.
 
-```{r intro-to-R20}
+```{r Intro-R20}
 options(stringsAsFactors=FALSE)
 ```
 
__Exercise__
How would you use factors to create a vector of colours for an arbitrarily long vector of fruits like `str_vector` above?
__Answer__ -```{r intro-to-R21, include=FALSE} +```{r Intro-R21, include=FALSE} long_str_vector = c(str_vector, str_vector, str_vector) fruit_cols = c("red", "yellow", "orange") fruit_colour_vec = fruit_cols[as.numeric(factor(long_str_vector, levels=c("Apple", "Banana", "Carrot")))] @@ -220,7 +220,7 @@ fruit_colour_vec = fruit_cols[as.numeric(factor(long_str_vector, levels=c("Apple ### Checking class/type We recommend checking your data is of the correct class after reading from files: -```{r intro-to-R22} +```{r Intro-R22} x = 1.4 is.numeric(x) is.character(x) @@ -232,7 +232,7 @@ is.factor(x) ## Basic data structures So far we have only looked at single values and vectors. Vectors are the simplest data structure in R. They are a 1-dimensional array of data all of the same type. If the input when creating a vector is of different types it will be coerced to the data-type that is most consistent with the data. -```{r intro-to-R23} +```{r Intro-R23} x = c("Hello", 5, TRUE) x class(x) @@ -242,7 +242,7 @@ Here we tried to put character, numeric and logical data into a single vector so A `matrix` is the two dimensional version of a vector, it also requires all data to be of the same type. If we combine a character vector and a numeric vector into a matrix, all the data will be coerced to characters: -```{r intro-to-R24} +```{r Intro-R24} x = c("A", "B", "C") y = c(1, 2, 3) class(x) @@ -253,7 +253,7 @@ m The quotation marks indicate that the numeric vector has been coerced to characters. Alternatively, to store data with columns of different data-types we can use a dataframe. -```{r intro-to-R25} +```{r Intro-R25} z = data.frame(x, y) z class(z[,1]) @@ -262,7 +262,7 @@ class(z[,2]) If you have set stringsAsFactors=FALSE as above you will find the first column remains characters, otherwise it will be automatically converted to a factor. -```{r intro-to-R26} +```{r Intro-R26} options(stringsAsFactors=TRUE) z = data.frame(x, y) class(z[,1]) @@ -270,14 +270,14 @@ class(z[,1]) Another difference between matrices and dataframes is the ability to select columns using the `$` operator: -```{r intro-to-R27, eval=FALSE} +```{r Intro-R27, eval=FALSE} m$x # throws an error z$x # ok ``` The final basic data structure is the `list`. Lists allow data of different types and different lengths to be stored in a single object. Each element of a list can be any other R object : data of any type, any data structure, even other lists or functions. 

-```{r intro-to-R28}
+```{r Intro-R28}
 l = list(m, z)
 ll = list(sublist=l, a_matrix=m, numeric_value=42, this_string="Hello World", even_a_function=cbind)
 ll
diff --git a/course_files/intro.Rmd b/course_files/intro.Rmd
index 093fecdc7..182e4b65c 100644
--- a/course_files/intro.Rmd
+++ b/course_files/intro.Rmd
@@ -4,9 +4,9 @@ output: html_document
 
 # Introduction to single-cell RNA-seq
 
-```{r intro0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())}
+```{r Intro0, echo=FALSE}
 library(knitr)
-opts_chunk$set(fig.align = "center", echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo()))
+opts_chunk$set(cache = TRUE, fig.align = "center", echo=FALSE)
 ```
 
 ## Bulk RNA-seq
 
@@ -32,7 +32,7 @@ opts_chunk$set(fig.align = "center", echo=FALSE, cache=TRUE, cache.extra = list(
 
 ## Workflow
 
-```{r intro1, out.width = '90%', fig.cap="Single cell sequencing (taken from Wikipedia)"}
+```{r Intro1, out.width = '90%', fig.cap="Single cell sequencing (taken from Wikipedia)"}
 knitr::include_graphics("figures/RNA-Seq_workflow-5.pdf.jpg")
 ```
 
@@ -43,7 +43,7 @@ Overall, experimental scRNA-seq protocols are similar to the methods used for bu
 
This course is concerned with the computational analysis of the data obtained from scRNA-seq experiments. The first steps (yellow) are general for any high-throughput sequencing data. Later steps (orange) require a mix of existing RNASeq analysis methods and novel methods to address the technical differences of scRNASeq. Finally, the biological interpretation (blue) __should__ be analyzed with methods specifically developed for scRNASeq.

-```{r intro2, out.width = '65%', fig.cap="Flowchart of the scRNA-seq analysis"}
+```{r Intro2, out.width = '65%', fig.cap="Flowchart of the scRNA-seq analysis"}
 knitr::include_graphics("figures/flowchart.png")
 ```
diff --git a/course_files/process-raw-QC.Rmd b/course_files/process-raw-QC.Rmd
index 329b81cbc..befececc8 100644
--- a/course_files/process-raw-QC.Rmd
+++ b/course_files/process-raw-QC.Rmd
@@ -3,7 +3,7 @@ output: html_document
 code_folding: hide
 ---
 
-```{r process-raw-QC0, include=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())}
+```{r Raw-QC0, include=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())}
 library(knitr)
 opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo()))
 ```
 
@@ -25,14 +25,14 @@ Today we will be performing our analysis using a single cell from an mESC datase
 
__Note__ The current text of the course is written for an AWS server for people who attend our course in person. You will have to download the files (both `ERR522959_1.fastq` and `ERR522959_2.fastq`) and create the `Share` directory yourself to run the commands. You can find the files here: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-2600/samples/

Now let's look at the files:
-```{bash process-raw-QC1, eval=FALSE}
+```{bash Raw-QC1, eval=FALSE}
 less Share/ERR522959_1.fastq
 less Share/ERR522959_2.fastq
 ```
 
Task 1: Try to work out what command you should use to produce the FastQC report. Hint: Try executing

-```{bash process-raw-QC2, eval=FALSE, collapse=TRUE}
+```{bash Raw-QC2, eval=FALSE, collapse=TRUE}
 fastqc -h
 ```
 
This command will tell you what options are available to pass to FastQC.
Feel fr
If you haven't done so already, generate the FastQC report using the commands below:

-```{bash process-raw-QC3, eval=FALSE, echo = TRUE}
+```{bash Raw-QC3, eval=FALSE, echo = TRUE}
 mkdir fastqc_results
 fastqc -o fastqc_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq
 ```
 
@@ -66,7 +66,7 @@ Now let's try to use Trim Galore! to remove those problematic adapters. It's a g
 Task 3: Work out the command you should use to trim the adapters from our data.
 
 Hint 1: You can use
-```{bash process-raw-QC4, eval=FALSE}
+```{bash Raw-QC4, eval=FALSE}
 trim_galore -h
 ```
 
@@ -81,7 +81,7 @@ Once you think you have successfully trimmed your reads and have confirmed this
 
You can use the command(s) below to trim the Nextera sequencing adapters:

-```{bash process-raw-QC5, eval=FALSE}
+```{bash Raw-QC5, eval=FALSE}
 mkdir fastqc_trimmed_results
 trim_galore --nextera -o fastqc_trimmed_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq
 ```
diff --git a/course_files/process-raw-align.Rmd b/course_files/process-raw-align.Rmd
index 16f3d1e48..69ef420a0 100644
--- a/course_files/process-raw-align.Rmd
+++ b/course_files/process-raw-align.Rmd
@@ -2,7 +2,7 @@ output: html_document
 ---
 
-```{r process-raw-align0, include=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())}
+```{r Process-raw-align0, include=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())}
 library(knitr)
 opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo()))
 ```
 
@@ -22,7 +22,7 @@ Two steps are required to perform STAR alignment. In the first step, the user pr
 
Let's create the index now. Remember, for reasons of time we are aligning to a transcriptome rather than a genome today, meaning we only need to provide STAR with the sequences of the transcripts we will be aligning reads to. You can obtain transcriptomes for many model organisms from Ensembl (https://www.ensembl.org/info/data/ftp/index.html).

Task 1: Execute the commands below to create the index:
-```{bash process-raw-align1, eval=FALSE}
+```{bash Process-raw-align1, eval=FALSE}
 mkdir indices
 mkdir indices/STAR
 STAR --runThreadN 4 --runMode genomeGenerate --genomeDir indices/STAR --genomeFastaFiles Share/2000_reference.transcripts.fa
@@ -41,7 +41,7 @@ Task 5: Try to understand the output of your alignment. Talk to one of the instr
 
### Solution for STAR Alignment

You can use the following commands to perform the mapping step:
-```{bash process-raw-align2,eval=FALSE}
+```{bash Process-raw-align2, eval=FALSE}
 mkdir results
 mkdir results/STAR
 
 STAR --runThreadN 4 --genomeDir indices/STAR --readFilesIn Share/ERR522959_1.fastq Share/ERR522959_2.fastq --outFileNamePrefix results/STAR/
@@ -83,7 +83,7 @@ As for STAR, you will need to produce an index for Kallisto before the pseudo-al
 
Task 6: Use the below command to produce the Kallisto index. Use the Kallisto manual (https://pachterlab.github.io/kallisto/manual) to work out what the options do in this command.

-```{bash process-raw-align3, eval=FALSE}
+```{bash Process-raw-align3, eval=FALSE}
 mkdir indices/Kallisto
 kallisto index -i indices/Kallisto/transcripts.idx Share/2000_reference.transcripts.fa
 ```
 
@@ -94,7 +94,7 @@ Task 7: Use the Kallisto manual to work out what command to use to perform pseud
 
Use the below command to perform pseudo-alignment

-```{bash process-raw-align4, eval=FALSE}
+```{bash Process-raw-align4, eval=FALSE}
 mkdir results/Kallisto
 kallisto pseudo -i indices/Kallisto/transcripts.idx -o results/Kallisto -b batch.txt
 ```
diff --git a/course_files/process-raw.Rmd b/course_files/process-raw.Rmd
index 97fd83c70..99af500f8 100644
--- a/course_files/process-raw.Rmd
+++ b/course_files/process-raw.Rmd
@@ -2,7 +2,7 @@ output: html_document
 ---
 
-```{r process-raw0, include=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())}
+```{r Process-raw0, include=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())}
 library(knitr)
 opts_chunk$set(cache=TRUE, cache.extra = list(R.version, sessionInfo()))
 ```
 
@@ -19,7 +19,7 @@ Thus reads will be mapped as if they are single-end sequenced despite actually
 being paired end.
 
 FastQ files have the format:
-```{bash process-raw1, eval=FALSE}
+```{bash Process-raw1, eval=FALSE}
 @ReadID
 READ SEQUENCE
 +
@@ -59,7 +59,7 @@ Alignment rows employ a standard format with the following columns:
 
 BAM/SAM files can be converted to the other format using 'samtools':
 
-```{bash process-raw2, eval=FALSE}
+```{bash Process-raw2, eval=FALSE}
 samtools view -S -b file.sam > file.bam
 samtools view -h file.bam > file.sam
 ```
 
@@ -77,7 +77,7 @@ multi-mapping reads first sort by read name and remove secondary alignments
 using samtools. [Picard](https://broadinstitute.github.io/picard/index.html)
 also contains a method for converting BAM to FastQ files.
 
-```{bash process-raw3, eval=FALSE}
+```{bash Process-raw3, eval=FALSE}
 # sort reads by name
 samtools sort -n original.bam -o sorted_by_name.bam
 # remove secondary alignments
@@ -99,7 +99,7 @@ Alternatively, you may pre-download the correct reference either from metadata i
 of the CRAM file, or by talking to whomever generated the CRAM, and specify that file using '-T'.
 Thus we recommend setting a specific cache location prior to doing this:
 
-```{bash process-raw4, eval=FALSE}
+```{bash Process-raw4, eval=FALSE}
 export REF_CACHE=/path_to/cache_directory_for_reference_genome
 samtools view -b -h -T reference_genome.fasta file.cram -o file.bam
 samtools view -C -h -T reference_genome.fasta file.bam -o file.cram
 ```
 
@@ -112,7 +112,7 @@ are from the correct sample. 'less' and 'more' can be used to inspect any text f
 
 By "piping" the output of samtools view into these commands using '|' we check
 each of these file types without having to save multiple copies of each file.
 
-```{bash process-raw5, eval=FALSE}
+```{bash Process-raw5, eval=FALSE}
 less file.txt
 more file.txt
 # counts the number of lines in file.txt
@@ -137,7 +137,7 @@ by running the command "naked" - e.g. 'samtools view', 'bedtools'
 
 __Answer__
 
-```{bash process-raw6, eval=TRUE, include=FALSE}
+```{bash Process-raw6, eval=TRUE, include=FALSE}
 samtools view -T data/2000_reference.transcripts.fa -H data/EXAMPLE.cram | more
 samtools view -T data/2000_reference.transcripts.fa -f 4 data/EXAMPLE.cram | wc -l # unmapped
 samtools view -T data/2000_reference.transcripts.fa -F 260 data/EXAMPLE.cram | wc -l # mapped
@@ -188,7 +188,7 @@ for any non-standard sequences added. There is no standardized way to do this.
So below is our custom perl script for creating a gtf and fasta file for ERCCs which can be appended to the genome. You may also need to alter a gtf file to deal with repetitive elements in introns when/if you want to quantify intronic reads. Any scripting language or even 'awk' and/or some text editors can be used to do this relatively efficiently, but they are beyond the scope of this course.

-```{bash process-raw7, eval=FALSE}
+```{bash Process-raw7, eval=FALSE}
 # Converts the Annotation file from
 # https://www.thermofisher.com/order/catalog/product/4456740 to
 # gtf and fasta files that can be added to existing genome fasta & gtf files.
@@ -244,11 +244,11 @@ the expected barcodes and assign the associated reads to the closest cell-barcod
 
We have [publicly available](https://github.com/tallulandrews/scRNASeqPipeline) perl scripts capable of demultiplexing any scRNASeq data with a single cell-barcode with or without UMIs for plate-based protocols. These can be used as so:

-```{bash process-raw8, eval=TRUE}
+```{bash Process-raw8, eval=TRUE}
 perl utils/1_Flexible_UMI_Demultiplexing.pl data/10cells_read1.fq data/10cells_read2.fq "C12U8" data/10cells_barcodes.txt 2 Ex
 ```
 
-```{bash process-raw9, eval=TRUE}
+```{bash Process-raw9, eval=TRUE}
 perl utils/1_Flexible_FullTranscript_Demultiplexing.pl data/10cells_read1.fq data/10cells_read2.fq "start" 12 data/10cells_barcodes.txt 2 Ex
 ```
 
@@ -271,7 +271,7 @@ barcode and try to find a "break point" between bigger libraries which are cells
+ some background and smaller libraries assumed to be purely background. Let's load some example simulated data which contain both large and small cells:

-```{r process-raw10}
+```{r Process-raw10}
 umi_per_barcode <- read.table("data/droplet_id_example_per_barcode.txt.gz")
 truth <- read.delim("data/droplet_id_example_truth.gz", sep=",")
 ```
 
@@ -282,7 +282,7 @@ To simplify calculations for this section exclude all barcodes
 with fewer than 10 total molecules.
 
 __Answer__
-```{r process-raw11, include=FALSE}
+```{r Process-raw11, include=FALSE}
 dim(umi_per_barcode)[1]
 dim(truth)[1]
 umi_per_barcode <- umi_per_barcode[umi_per_barcode[,2] > 10,]
 ```
 
 One approach is to look for the inflection point where the total molecules per
 barcode suddenly drops:
 
-```{r process-raw12}
+```{r Process-raw12}
 barcode_rank <- rank(-umi_per_barcode[,2])
 plot(barcode_rank, umi_per_barcode[,2], xlim=c(1,8000))
 ```
 
 Here we can see a roughly exponential curve of library sizes, so to make
 things simpler let's log-transform them.
 
-```{r process-raw13}
+```{r Process-raw13}
 log_lib_size <- log10(umi_per_barcode[,2])
 plot(barcode_rank, log_lib_size, xlim=c(1,8000))
 ```
 That's better, the "knee" in the distribution is much more pronounced. We
 could manually estimate where the "knee" is but it is much more reproducible to
 algorithmically identify this point.
 
-```{r process-raw14}
+```{r Process-raw14}
 # inflection point
 o <- order(barcode_rank)
 log_lib_size <- log_lib_size[o]
@@ -329,7 +329,7 @@ c(TPR, Recall)
 
 Another approach is to fit a mixture model and find where the higher and lower distributions intersect. However, data may not fit the assumed distributions very well:
 
-```{r process-raw15}
+```{r Process-raw15}
 set.seed(-92497)
 # mixture model
 require("mixtools")
@@ -348,7 +348,7 @@ Identify cells using this split point and calculate the TPR and Recall.
__Answer__

-```{r process-raw16, include=FALSE}
+```{r Process-raw16, include=FALSE}
 cells <- umi_per_barcode[umi_per_barcode[,2] > 10^split,1]
 
 TPR <- sum(cells %in% truth[,1])/length(cells)
@@ -359,7 +359,7 @@ c(TPR, Recall)
 
 A third, used by CellRanger, assumes a ~10-fold range of library sizes for real
 cells and estimates this range using the expected number of cells.
 
-```{r process-raw17}
+```{r Process-raw17}
 n_cells <- length(truth[,1])
 # CellRanger
 totals <- umi_per_barcode[,2]
@@ -373,7 +373,7 @@ __Exercise__
 Identify cells using this threshold and calculate the TPR and Recall.
 
 __Answer__
-```{r process-raw18, include=FALSE}
+```{r Process-raw18, include=FALSE}
 cells <- umi_per_barcode[umi_per_barcode[,2] > thresh,1]
 
 TPR <- sum(cells %in% truth[,1])/length(cells)
@@ -393,7 +393,7 @@ cells in highly diverse samples.
 
 Below we have provided code for how this method is currently run:
 (We will update this page when the method is officially released)
 
-```{r process-raw19, eval=FALSE}
+```{r Process-raw19, eval=FALSE}
 require("Matrix")
 raw.counts <- readRDS("data/droplet_id_example.rds")
diff --git a/course_files/projection.Rmd b/course_files/projection.Rmd
index dfafafece..9b9b114e6 100644
--- a/course_files/projection.Rmd
+++ b/course_files/projection.Rmd
@@ -3,13 +3,13 @@ output: html_document
 ---
 
 ## Comparing/Combining scRNASeq datasets
-```{r echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE}
+```{r projection0, echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE}
 library(knitr)
 library(googleVis)
-opts_chunk$set(fig.align="center", dev = 'png')
+opts_chunk$set(cache = TRUE, fig.align="center", dev = 'png', cache.lazy = FALSE)
 op <- options(gvis.plot.tag='chart')
 ```
-```{r, echo=TRUE, message=FALSE, warning=FALSE}
+```{r projection1, echo=TRUE, message=FALSE, warning=FALSE}
 library(scater)
 library(SingleCellExperiment)
 ```
@@ -30,24 +30,24 @@ match in a database for a newly identified nucleotide or amino acid sequence. Th
 compare datasets of similar biological origin collected by different labs to
 ensure that the annotation and the analysis is consistent.
 
-```{r, echo=FALSE, out.width = '80%', fig.cap="Label-centric dataset comparison can be used to compare the annotations of two different samples."}
+```{r projection2, echo=FALSE, out.width = '80%', fig.cap="Label-centric dataset comparison can be used to compare the annotations of two different samples."}
 knitr::include_graphics("figures/CourseCompareTypes.png")
 ```
 
-```{r, echo=FALSE, out.width = '80%', fig.cap="Label-centric dataset comparison can project cells from a new experiment onto an annotated reference."}
+```{r projection3, echo=FALSE, out.width = '80%', fig.cap="Label-centric dataset comparison can project cells from a new experiment onto an annotated reference."}
 knitr::include_graphics("figures/CourseAtlasAssignment.png")
 ```
 
 The cross-dataset normalization approach can also be used to compare datasets of similar biological origin; unlike the label-centric approach it enables the joint analysis of multiple datasets to facilitate the identification of rare cell-types which may be too sparsely sampled in each individual dataset to be reliably detected. However, cross-dataset normalization is not applicable to very large and diverse references since it assumes a significant portion of the biological variability in each of the datasets overlaps with others.

-```{r, echo=FALSE, out.width = '80%', fig.cap="Cross-dataset normalization enables joint-analysis of 2+ scRNASeq datasets."}
+```{r projection4, echo=FALSE, out.width = '80%', fig.cap="Cross-dataset normalization enables joint-analysis of 2+ scRNASeq datasets."}
 knitr::include_graphics("figures/CourseCrossNorm.png")
 ```
 ### Datasets
 
 We will be running these methods on two human pancreas datasets: [@Muraro2016-yk] and [@Segerstolpe2016-wc]. Since the pancreas has been widely studied, these datasets are well annotated.
-```{r}
+```{r projection5}
 muraro <- readRDS("data/pancreas/muraro.rds")
 segerstolpe <- readRDS("data/pancreas/segerstolpe.rds")
 ```
 
 This data has already been formatted for scmap. Cell type labels must be stored in the `cell_type1` column of the `colData` slots, and gene ids that are consistent across both datasets must be stored in the `feature_symbol` column of the `rowData` slots.
 
 First, let's check our gene-ids match across both datasets:
-```{r}
+```{r projection6}
 sum(rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol)/nrow(muraro)
 sum(rowData(segerstolpe)$feature_symbol %in% rowData(muraro)$feature_symbol)/nrow(segerstolpe)
 ```
 
@@ -64,14 +64,14 @@ genes in muraro. This is as expected because the segerstolpe dataset was more de
 However, it highlights some of the difficulties in comparing scRNASeq datasets.
 
 We can confirm this by checking the overall size of these two datasets.
 
-```{r}
+```{r projection7}
 dim(muraro)
 dim(segerstolpe)
 ```
 
 In addition, we can check the cell-type annotations for each of these datasets using the command below:
 
-```{r}
+```{r projection8}
 summary(factor(colData(muraro)$cell_type1))
 summary(factor(colData(segerstolpe)$cell_type1))
 ```
 
@@ -89,14 +89,14 @@ to uncover a novel cell-type among them (or a sub-type within the existing annot
 
 To simplify our demonstration analyses we will remove the small classes of unassigned cells, and the poor quality cells. We will retain the "unclassified endocrine" to see if any of these methods can elucidate what cell-type they belong to.
 
-```{r}
+```{r projection9}
 segerstolpe <- segerstolpe[,colData(segerstolpe)$cell_type1 != "unclassified"]
 segerstolpe <- segerstolpe[,colData(segerstolpe)$cell_type1 != "not applicable",]
 muraro <- muraro[,colData(muraro)$cell_type1 != "unclear"]
 ```
 
 ### Projecting cells onto annotated cell-types (scmap)
 
-```{r, echo=TRUE, message=FALSE, warning=FALSE}
+```{r projection10, echo=TRUE, message=FALSE, warning=FALSE}
 library(scmap)
 set.seed(1234567)
 ```
 
@@ -110,25 +110,25 @@ clusters. Since we want to know whether PSCs and mesenchymal cells are synonymou
 other so we will build an index for each dataset. This requires first selecting
 the most informative features for the reference dataset.
 
-```{r}
+```{r projection11}
 muraro <- selectFeatures(muraro, suppress_plot = FALSE)
 ```
 
 Genes highlighted with the red colour will be used in the further analysis (projection).
 
-```{r}
+```{r projection12}
 segerstolpe <- selectFeatures(segerstolpe, suppress_plot = FALSE)
 ```
 
 From the y-axis of these plots we can see that scmap uses a dropout-based feature selection method.
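
As a quick sanity check (an editorial sketch, not from the original course), you can count how many features were flagged in each object; `selectFeatures` stores a logical `scmap_features` column in the `rowData` of each dataset, and selects 500 features by default.

```{r scmap-feature-check}
# Sketch: count and preview the features flagged by selectFeatures()
table(rowData(muraro)$scmap_features)
table(rowData(segerstolpe)$scmap_features)
# A few of the selected gene symbols
head(rowData(muraro)$feature_symbol[rowData(muraro)$scmap_features])
```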
Now calculate the cell-type index:
-```{r}
+```{r projection13}
 muraro <- indexCluster(muraro)
 segerstolpe <- indexCluster(segerstolpe)
 ```
 
 We can also visualize the index:
-```{r}
+```{r projection14}
 heatmap(as.matrix(metadata(muraro)$scmap_cluster_index))
 ```
 
@@ -138,7 +138,7 @@ __Exercise__
 Using the rowData of each dataset how many genes were selected as features in both datasets? What does this tell you about these datasets?
 
 __Answer__
-```{r, include=FALSE, eval=TRUE}
+```{r projection15, include=FALSE, eval=TRUE}
 ans = sum(rowData(muraro)[rowData(muraro)$scmap_features, "feature_symbol"] %in% rowData(segerstolpe)[rowData(segerstolpe)$scmap_features, "feature_symbol"])
 print(ans)
 ```
 
#### Projecting
 
scmap computes the distance from each cell to each cell-type in the reference index, then applies an empirically derived threshold to determine which cells are assigned to the closest reference cell-type and which are unassigned. To account for differences in sequencing depth, distance is calculated using the spearman correlation and cosine distance, and only cells with a consistent assignment with both distances are returned as assigned. We will project the `segerstolpe` dataset onto the `muraro` dataset:

-```{r, warning=FALSE}
+```{r projection16, warning=FALSE}
 seger_to_muraro <- scmapCluster(
   projection = segerstolpe,
   index_list = list(
     muraro = metadata(muraro)$scmap_cluster_index
   )
 )
 ```
 
 and `muraro` onto `segerstolpe`
 
-```{r, warning=FALSE}
+```{r projection17, warning=FALSE}
 muraro_to_seger <- scmapCluster(
   projection = muraro,
   index_list = list(
     seger = metadata(segerstolpe)$scmap_cluster_index
   )
 )
 ```
 
 Note that in each case we are projecting to a single dataset but that this could be extended to any number of datasets for which we have computed indices. Now let's compare the original cell-type labels with the projected labels:
-```{r}
+```{r projection18}
 table(colData(muraro)$cell_type1, muraro_to_seger$scmap_cluster_labs)
 ```
 Here we can see that cell-types do map to their equivalents in segerstolpe, and importantly we see that all but one of the "mesenchymal" cells were assigned to the "PSC" class.
 
-```{r}
+```{r projection19}
 table(colData(segerstolpe)$cell_type1, seger_to_muraro$scmap_cluster_labs)
 ```
 Again we see cell-types match each other and that all but one of the "PSCs" match the "mesenchymal" cells providing strong evidence that these two annotations should be considered synonymous.
 
 We can also visualize these tables using a [Sankey diagram](https://developers.google.com/chart/interactive/docs/gallery/sankey):
-```{r results='asis', tidy=FALSE}
+```{r projection20, results='asis', tidy=FALSE}
 plot(getSankey(colData(muraro)$cell_type1,  muraro_to_seger$scmap_cluster_labs[,1], plot_height=400))
 ```
 
 __Exercise__
 How many of the previously unclassified cells would we be able to assign to cell-types using scmap?
 
 __Answer__
-```{r, include=FALSE}
+```{r projection21, include=FALSE}
 t2 <- table(colData(segerstolpe)$cell_type1, seger_to_muraro$scmap_cluster_labs)
 seger_assigned <- sum(t2[c("unclassified endocrine"), 1:9])
 ```
 
@@ -201,7 +201,7 @@ cells). However, this process is stochastic so we must fix the random seed to en
 
We have already performed feature selection for this dataset so we can go straight to building the index.

-```{r}
+```{r projection22}
 set.seed(193047)
 segerstolpe <- indexCell(segerstolpe)
 muraro <- indexCell(muraro)
 ```
 
@@ -213,12 +213,12 @@ number of clusters and the number of features used in each of these subclusterin
 dataset with the same or most similar pattern of cluster assignments.
 
 We can examine the cluster assignment patterns for the reference datasets using:
-```{r}
+```{r projection23}
 metadata(muraro)$scmap_cell_index$subclusters[1:5,1:5]
 ```
 
 To project and find the `w` nearest neighbours we use a similar command as before:
-```{r}
+```{r projection24}
 muraro_to_seger <- scmapCell(
   projection = muraro,
   index_list = list(
     seger = metadata(segerstolpe)$scmap_cell_index
   )
 )
 ```
 
 We can again look at the results:
-```{r}
+```{r projection25}
 muraro_to_seger$seger[[1]][,1:5]
 ```
 This shows the column number of the 5 nearest neighbours in segerstolpe to each of the cells in muraro. We could then calculate a pseudotime estimate, branch assignment, or other cell-level data by selecting the appropriate data from the colData of the segerstolpe data set. As a demonstration we will find the cell-type of the nearest neighbour of each cell.
-```{r}
+```{r projection26}
 cell_type_NN <- colData(segerstolpe)$cell_type1[muraro_to_seger$seger[[1]][1,]]
 head(cell_type_NN)
 ```
 
@@ -247,14 +247,14 @@ versions. First is a fully supervised method which assumes cell-types are known
 
 Metaneighbour compares cell-types across datasets by building a cell-cell spearman correlation network. The method then tries to predict the label of each cell through weighted "votes" of its nearest-neighbours. It then scores the overall similarity between two clusters as the AUROC for assigning cells of typeA to typeB based on these weighted votes. An AUROC of 1 would indicate all the cells of typeA were assigned to typeB before any other cells were, and an AUROC of 0.5 is what you would get if cells were being randomly assigned.
 
 Metaneighbour is just a couple of R functions, not a complete package, so we have to load them using `source`
-```{r}
+```{r projection27}
 source("utils/2017-08-28-runMN-US.R")
 ```
#### Prepare Data
 
 Metaneighbour requires all datasets to be combined into a single expression matrix prior to running:
 
-```{r}
+```{r projection28}
 is.common <- rowData(muraro)$feature_symbol %in% rowData(segerstolpe)$feature_symbol
 muraro <- muraro[is.common,]
 segerstolpe <- segerstolpe[match(rowData(muraro)$feature_symbol, rowData(segerstolpe)$feature_symbol),]
@@ -274,13 +274,13 @@ rownames(pheno) <- colnames(combined_logcounts)
 
 Metaneighbor includes a feature selection method to identify highly variable genes.
 
-```{r}
+```{r projection29}
 var.genes = get_variable_genes(combined_logcounts, pheno)
 ```
 
 Since Metaneighbor is much slower than `scmap`, we will downsample these datasets.
 
-```{r}
+```{r projection30}
 subset <- sample(1:nrow(pheno), 2000)
 combined_logcounts <- combined_logcounts[,subset]
 pheno <- pheno[subset,]
@@ -292,7 +292,7 @@ Now we are ready to run Metaneighbor. First we will run the unsupervised
 version that will let us see which cell-types are most similar across the
 two datasets.

-```{r}
+```{r projection31}
 unsup <- run_MetaNeighbor_US(var.genes, combined_logcounts, unique(pheno$Celltype), pheno)
 heatmap(unsup)
 ```
 
@@ -306,22 +306,22 @@ Learning the biological/technical effects is done with either singular value dec
 
 mnnCorrect also assumes you've already subset your expression matrices so that they contain identical genes in the same order; fortunately we have already done this for our datasets when we set up our data for Metaneighbor.
 
-```{r, echo=FALSE, out.width = '80%', fig.cap="mnnCorrect batch/dataset effect correction. From Haghverdi et al. 2017"}
+```{r projection32, echo=FALSE, out.width = '80%', fig.cap="mnnCorrect batch/dataset effect correction. From Haghverdi et al. 2017"}
 knitr::include_graphics("figures/mnnCorrectDiagramCropped.png")
 ```
-```{r}
+```{r projection33}
 require("scran")
 # mnnCorrect will take several minutes to run
 corrected <- mnnCorrect(logcounts(muraro), logcounts(segerstolpe), k=20, sigma=1, pc.approx=TRUE, subset.row=var.genes, svd.dim=3)
 ```
 
 First let's check that we found a sufficient number of mnn pairs; mnnCorrect returns a list of dataframes with the mnn pairs for each dataset.
 
-```{r}
+```{r projection34}
 dim(corrected$pairs[[1]]) # muraro -> others
 dim(corrected$pairs[[2]]) # seger -> others
 ```
 
 The first and second columns contain the cell column IDs and the third column contains a number indicating which dataset/batch the column 2 cell belongs to. In our case, we are only comparing two datasets so all the mnn pairs have been assigned to the second table and the third column contains only ones.
 
-```{r}
+```{r projection35}
 head(corrected$pairs[[2]])
 total_pairs <- nrow(corrected$pairs[[2]])
 n_unique_seger <- length(unique((corrected$pairs[[2]][,1])))
@@ -334,7 +334,7 @@ __Exercise__
 Which cell-types had mnns across these datasets? Should we increase/decrease k?
 
 __Answer__
-```{r, include=FALSE, eval=FALSE}
+```{r projection36, include=FALSE, eval=FALSE}
 seger_matched_cells <- unique((corrected$pairs[[2]][,1]))
 summary(factor(colData(segerstolpe)$cell_type1[seger_matched_cells]))
 
 muraro_matched_cells <- unique((corrected$pairs[[2]][,2]))
 summary(factor(colData(muraro)$cell_type1[muraro_matched_cells]))
 ```
 
 Now we could create a combined dataset to jointly analyse these data. However, the corrected data is no longer counts and usually will contain negative expression values; thus some analysis tools may no longer be appropriate. For simplicity let's just plot a joint TSNE.
 
-```{r}
+```{r projection37}
 require("Rtsne")
 joint_expression_matrix <- cbind(corrected$corrected[[1]], corrected$corrected[[2]])
 
@@ -365,7 +365,7 @@ Seurat uses gene-gene correlations to identify the biological structure in the d
 
 Note because Seurat uses up a lot of library space you will have to restart your R-session to load it, and the plots/output won't be automatically generated on this page.
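
As an aside (an editorial sketch, not part of the original course): one rough way to quantify how well mnnCorrect mixed the two datasets, rather than judging the TSNE by eye, is to check for each cell whether its nearest neighbour in the corrected expression space comes from the other dataset. This assumes `joint_expression_matrix`, `muraro` and `segerstolpe` from the chunks above are still in memory.

```{r mnn-mixing-check, eval=FALSE}
# Sketch: fraction of cells whose nearest neighbour in the corrected space
# belongs to the other dataset. Values near 0 suggest the datasets are still
# separated; higher values suggest better mixing.
dataset_labels <- rep(c("muraro", "seger"),
                      times = c(ncol(muraro), ncol(segerstolpe)))
dists <- as.matrix(dist(t(joint_expression_matrix)))
diag(dists) <- Inf  # ignore self-distances
nearest <- apply(dists, 1, which.min)
mean(dataset_labels[nearest] != dataset_labels)
```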
Reload the data:
-```{r, eval=FALSE}
+```{r projection38, eval=FALSE}
 muraro <- readRDS("data/pancreas/muraro.rds")
 segerstolpe <- readRDS("data/pancreas/segerstolpe.rds")
 segerstolpe <- segerstolpe[,colData(segerstolpe)$cell_type1 != "unclassified"]
@@ -382,7 +382,7 @@ identical(rownames(segerstolpe), rownames(muraro))
 
 First we will reformat our data into Seurat objects:
 
-```{r, eval=FALSE}
+```{r projection39, eval=FALSE}
 require("Seurat")
 set.seed(4719364)
 muraro_seurat <- CreateSeuratObject(raw.data=assays(muraro)[["normcounts"]]) # raw counts aren't available for muraro
@@ -394,7 +394,7 @@ seger_seurat@meta.data[, "dataset"] <- 2
 seger_seurat@meta.data[, "celltype"] <- paste("s",colData(segerstolpe)$cell_type1, sep="-")
 ```
 Next we must normalize, scale and identify highly variable genes for each dataset:
-```{r, eval=FALSE}
+```{r projection40, eval=FALSE}
 muraro_seurat <- NormalizeData(object=muraro_seurat)
 muraro_seurat <- ScaleData(object=muraro_seurat)
 muraro_seurat <- FindVariableGenes(object=muraro_seurat, do.plot=TRUE)
 
 seger_seurat <- NormalizeData(object=seger_seurat)
 seger_seurat <- ScaleData(object=seger_seurat)
 seger_seurat <- FindVariableGenes(object=seger_seurat, do.plot=TRUE)
 ```
 
-```{r, echo=FALSE, out.width = '80%', fig.cap="muraro variable genes"}
+```{r projection41, echo=FALSE, out.width = '80%', fig.cap="muraro variable genes"}
 knitr::include_graphics("figures/muraro_seurat_hvg.png")
 ```
-```{r, echo=FALSE, out.width = '80%', fig.cap="segerstolpe variable genes"}
+```{r projection42, echo=FALSE, out.width = '80%', fig.cap="segerstolpe variable genes"}
 knitr::include_graphics("figures/seger_seurat_hvg.png")
 ```
 
 Even though Seurat corrects for the relationship between dispersion and mean expression, it doesn't use the corrected value when ranking features. Compare the results of the command below with the results in the plots above:
 
-```{r, eval=FALSE}
+```{r projection43, eval=FALSE}
 head(muraro_seurat@hvg.info, 50)
 head(seger_seurat@hvg.info, 50)
 ```
 
 But we will follow their example and use the top 2000 most dispersed genes without correcting for mean expression from each dataset anyway.
 
-```{r, eval=FALSE}
+```{r projection44, eval=FALSE}
 gene.use <- union(rownames(x = head(x = muraro_seurat@hvg.info, n = 2000)),
                   rownames(x = head(x = seger_seurat@hvg.info, n = 2000)))
 ```
 
 __Exercise__
 Find the features we would use if we selected the top 2000 most dispersed after scaling by mean. (Hint: consider the `order` function)
 
 __Answer__
-```{r, include=FALSE, eval=FALSE}
+```{r projection45, include=FALSE, eval=FALSE}
 seger_hvg <- seger_seurat@hvg.info
 seger_hvg <- seger_hvg[order(seger_hvg[,3], decreasing=TRUE),]
 muraro_hvg <- muraro_seurat@hvg.info
 muraro_hvg <- muraro_hvg[order(muraro_hvg[,3], decreasing=TRUE),]
 
 gene.use.scaled <- union(rownames(seger_hvg[1:2000,]), rownames(muraro_hvg[1:2000,]))
 ```
 
 Now we will run CCA to find the shared correlation structure for these two datasets:
 
 Note to speed up the calculations we will be using only the top 5 dimensions but ideally you would consider many more and then select the top most informative ones using `DimHeatmap`.

-```{r, eval=FALSE}
+```{r projection46, eval=FALSE}
 merged_seurat <- RunCCA(object=muraro_seurat, object2=seger_seurat, genes.use=gene.use, add.cell.id1="m", add.cell.id2="s", num.cc = 5)
 DimPlot(object = merged_seurat, reduction.use = "cca", group.by = "dataset", pt.size = 0.5) # Before correcting
 ```
 
-```{r, echo=FALSE, out.width = '80%', fig.cap="Before Aligning"}
+```{r projection47, echo=FALSE, out.width = '80%', fig.cap="Before Aligning"}
 knitr::include_graphics("figures/cca_before.png")
 ```
 
 To identify dataset-specific cell-types we compare how well cells are 'explained' by CCA vs dataset-specific principal component analysis.
 
-```{r, eval=FALSE}
+```{r projection48, eval=FALSE}
 merged_seurat <- CalcVarExpRatio(object = merged_seurat, reduction.type = "pca", grouping.var = "dataset", dims.use = 1:5)
 merged.all <- merged_seurat
 merged_seurat <- SubsetData(object=merged_seurat, subset.name="var.ratio.pca", accept.low = 0.5) # CCA > 1/2 as good as PCA
 merged.discard <- SubsetData(object=merged.all, subset.name="var.ratio.pca", accept.high = 0.5)
 
 summary(factor(merged.discard@meta.data$celltype)) # check the cell-type of the discarded cells.
 ```
 
 Here we can see that despite both datasets containing endothelial cells, almost all of them have been discarded as "dataset-specific". Now we can align the datasets:
 
-```{r, eval=FALSE}
+```{r projection49, eval=FALSE}
 merged_seurat <- AlignSubspace(object = merged_seurat, reduction.type = "cca", grouping.var = "dataset", dims.align = 1:5)
 DimPlot(object = merged_seurat, reduction.use = "cca.aligned", group.by = "dataset", pt.size = 0.5) # After aligning subspaces
 ```
 
-```{r, echo=FALSE, out.width = '80%', fig.cap="After Aligning"}
+```{r projection50, echo=FALSE, out.width = '80%', fig.cap="After Aligning"}
 knitr::include_graphics("figures/cca_after.png")
 ```
 
 __Exercise__
 Compare the results if you use the features after scaling dispersions.
 
 __Answer__
-```{r, eval=FALSE, include=FALSE}
+```{r projection51, eval=FALSE, include=FALSE}
 merged_seurat <- RunCCA(object=muraro_seurat, object2=seger_seurat, genes.use=gene.use.scaled, add.cell.id1="m", add.cell.id2="s", num.cc = 5)
 merged_seurat <- AlignSubspace(object = merged_seurat, reduction.type = "cca", grouping.var = "dataset", dims.align = 1:5)
 DimPlot(object = merged_seurat, reduction.use = "cca.aligned", group.by = "dataset", pt.size = 0.5) # After aligning subspaces
 ```
-```{r, echo=FALSE, out.width = '80%', fig.cap="After Aligning"}
+```{r projection52, echo=FALSE, out.width = '80%', fig.cap="After Aligning"}
 knitr::include_graphics("figures/cca_after2.png")
 ```
 
@@ -491,6 +491,6 @@ Use the clustering methods we previously covered on the combined datasets. Do yo
 
 ### sessionInfo()
 
-```{r echo=FALSE}
+```{r projection53, echo=FALSE}
 sessionInfo()
 ```
diff --git a/course_files/pseudotime.Rmd b/course_files/pseudotime.Rmd
index 69dc9dbb2..6097d5894 100644
--- a/course_files/pseudotime.Rmd
+++ b/course_files/pseudotime.Rmd
@@ -5,11 +5,11 @@ output: html_document
 
 ## Pseudotime analysis
 
-```{r, echo=FALSE}
+```{r pseudotime0, echo=FALSE}
 library(knitr)
-opts_chunk$set(fig.align = "center")
+opts_chunk$set(cache = TRUE, fig.align = "center")
 ```
-```{r, echo=TRUE, message=FALSE, warning=FALSE}
+```{r pseudotime1, echo=TRUE, message=FALSE, warning=FALSE}
 library(SingleCellExperiment)
 library(TSCAN)
 library(M3Drop)
@@ -73,11 +73,11 @@ gold standard of open-source software hosted in a reputable repository.
 
 The following figures from the paper summarise some of the features of the various tools.

-```{r pseudotime-methods-description, out.width = '90%', fig.cap="Descriptions of trajectory inference methods for single-cell transcriptomics data (Fig. 2 from Cannoodt et al, 2016).", echo=FALSE}
+```{r pseudotime2, out.width = '90%', fig.cap="Descriptions of trajectory inference methods for single-cell transcriptomics data (Fig. 2 from Cannoodt et al, 2016).", echo=FALSE}
 knitr::include_graphics("figures/cannoodt_pseudotime_properties.png")
 ```
 
-```{r pseudotime-methods, out.width = '90%', fig.cap="Characterization of trajectory inference methods for single-cell transcriptomics data (Fig. 3 from Cannoodt et al, 2016).", echo=FALSE}
+```{r pseudotime3, out.width = '90%', fig.cap="Characterization of trajectory inference methods for single-cell transcriptomics data (Fig. 3 from Cannoodt et al, 2016).", echo=FALSE}
 knitr::include_graphics("figures/cannoodt_pseudotime_methods.png")
 ```
 
@@ -86,7 +86,7 @@ knitr::include_graphics("figures/cannoodt_pseudotime_methods.png")
 
 Let us take a first look at the Deng data, without yet applying sophisticated pseudotime methods. As the plot below shows, simple PCA does a very good job of displaying the structure in these data. It is only once we reach the blast cell types ("earlyblast", "midblast", "lateblast") that PCA struggles to separate the distinct cell types.
 
-```{r data-overview}
+```{r pseudotime4}
 deng_SCE <- readRDS("data/deng/deng-reads.rds")
 deng_SCE$cell_type2 <- factor(
     deng_SCE$cell_type2,
@@ -103,7 +103,7 @@ plotPCA(deng_SCE, colour_by = "cell_type2")
 
 PCA, here, provides a useful baseline for assessing different pseudotime methods. For a very naive pseudotime we can just take the co-ordinates of the first principal component.
 
-```{r pca-pseudotime}
+```{r pseudotime5}
 deng_SCE$PC1 <- reducedDim(deng_SCE, "PCA")[,1]
 ggplot(as.data.frame(colData(deng_SCE)), aes(x = PC1, y = cell_type2, 
                                              colour = cell_type2)) +
@@ -124,7 +124,7 @@ TSCAN combines clustering with pseudotime analysis. First it clusters the cells
 
 First we will try to use all genes to order the cells.
 
-```{r tscan-all-genes}
+```{r pseudotime6}
 procdeng <- TSCAN::preprocess(deng)
 colnames(procdeng) <- 1:ncol(deng)
 dengclust <- TSCAN::exprmclust(procdeng, clusternum = 10)
@@ -140,7 +140,7 @@ Frustratingly, TSCAN only provides pseudotime values for 221 of 268 cells, silen
 
 Again, we examine which timepoints have been assigned to each state:
 
-```{r tscan-vs-truth}
+```{r pseudotime7}
 cellLabels[dengclust$clusterid == 10]
 ggplot(as.data.frame(colData(deng_SCE)), 
        aes(x = pseudotime_order_tscan, 
@@ -169,7 +169,7 @@ defined as a separate cell state.
 
 Unfortunately, Monocle does not work when all the genes are used, so we must
 carry out feature selection. First, we use M3Drop:
 
-```{r m3d-select-genes}
+```{r pseudotime8}
 m3dGenes <- as.character(
     M3DropFeatureSelection(deng)$Gene
 )
d <- d[!duplicated(rownames(d)), ]
```

Now run monocle:
-```{r monocle-all-genes, message=FALSE, warning=FALSE}
+```{r pseudotime9, message=FALSE, warning=FALSE}
 colnames(d) <- 1:ncol(d)
 geneNames <- rownames(d)
 rownames(d) <- 1:nrow(d)
@@ -206,7 +206,7 @@ pseudotime_order_monocle <-
 ```
 
 We can again compare the inferred pseudotime to the known sampling timepoints.
-```{r monocle-vs-truth}
+```{r pseudotime10}
 deng_SCE$pseudotime_monocle <- pseudotime_monocle$pseudotime
 ggplot(as.data.frame(colData(deng_SCE)), 
       aes(x = pseudotime_monocle, 
@@ -229,7 +229,7 @@ Monocle - at least with its default settings - performs poorly on these data.
Th
We will take the rank order of cells in the first diffusion map component as "diffusion map pseudotime" here.

-```{r destiny-deng}
+```{r pseudotime11}
 deng <- logcounts(deng_SCE)
 colnames(deng) <- cellLabels
 dm <- DiffusionMap(t(deng))
@@ -281,7 +281,7 @@ determine which value of "k" (number of nearest neighbours) yields an embedding
 most resembles a trajectory. Then we estimate the [locally linear embedding](https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction) of the cells.
 
-```{r slicer-analyis, message=FALSE, warning=FALSE}
+```{r pseudotime12, message=FALSE, warning=FALSE}
 library("lle")
 slicer_genes <- select_genes(t(deng))
 k <- select_k(t(deng[slicer_genes,]), kmin = 30, kmax=60)
@@ -299,7 +299,7 @@ blue. Here we show the graph computed using 10 nearest neighbours.
 
 Here, SLICER appears to detect one major trajectory with one branch.
 
-```{r slicer-build-graph}
+```{r pseudotime13}
 slicer_traj_graph <- conn_knn_graph(slicer_traj_lle, 10)
 plot(slicer_traj_graph, main = "Fully connected kNN graph from SLICER")
 ```
 
 From this graph we can identify "extreme" cells that are candidates for start/end
 cells in the trajectory.
 
-```{r slicer}
+```{r pseudotime14}
 ends <- find_extreme_cells(slicer_traj_graph, slicer_traj_lle)
 start <- ends[1]
 ```
 
 Having defined a start cell we can order the cells in the estimated pseudotime.
 
-```{r}
+```{r pseudotime15}
 pseudotime_order_slicer <- cell_order(slicer_traj_graph, start)
 branches <- assign_branches(slicer_traj_graph, start)
@@ -333,7 +333,7 @@ We can again compare the inferred pseudotime to the known sampling timepoints.
 SLICER does not provide a pseudotime value per se, just an ordering of cells.
 
-```{r slicer-vs-truth}
+```{r pseudotime16}
 ggplot(as.data.frame(colData(deng_SCE)), 
        aes(x = pseudotime_slicer, 
          y = cell_type2, colour = cell_type2)) +
@@ -374,7 +374,7 @@ With Ouija we can model genes as either exhibiting monotonic up or down regulati
 
 Here we can "cheat" a little and check that our selected marker genes do actually identify different timepoints of the differentiation process.
 
-```{r ouija-response-type, fig.height=11}
+```{r pseudotime17, fig.height=11}
 ouija_markers_down <- c("Dazl", "Rnf17", "Sycp3", "Fgf8", 
                         "Egfr", "Bmp5", "Bmp15", "Pou5f1")
 ouija_markers_up <- c("Creb3", "Gpx4", "Krt8", "Elf5", "Cdx2", 
@@ -400,7 +400,7 @@ In general, HMC will provide more accurate inference with approximately correct
 
 To help the Ouija model, we provide it with prior information about the strength of switches for up- and down-regulated genes. By setting switch strength to -10 for down-regulated genes and 10 for up-regulated genes with a prior strength standard deviation of 0.5 we are telling the model that we are confident about the expected behaviour of these genes across the differentiation process.
 
-```{r ouija-fit, warning=FALSE, message=FALSE, result='hide'}
+```{r pseudotime18, warning=FALSE, message=FALSE, result='hide'}
 options(mc.cores = parallel::detectCores())
 response_type <- c(rep("switch", length(ouija_markers_down) + 
                            length(ouija_markers_up)), 
@@ -423,26 +423,26 @@ print(oui_vb)
 
 We can plot the gene expression over pseudotime along with the maximum a posteriori (MAP) estimates of the mean function (the sigmoid or Gaussian transient function) using the plot_expression function.
-```{r ouija-plot-exprs} +```{r pseudotime19} plot_expression(oui_vb) ``` We can also visualise when in the trajectory gene regulation behaviour occurs, either in the form of the switch time or the peak time (for switch-like or transient genes) using the plot_switch_times and plot_transient_times functions: -```{r ouija-plot-switch-times} +```{r pseudotime20} plot_switch_times(oui_vb) plot_peak_times(oui_vb) ``` Identify metastable states using consistency matrices. -```{r ouija-consistency} +```{r pseudotime21} cmo <- consistency_matrix(oui_vb) plot_consistency(oui_vb) cell_classifications <- cluster_consistency(cmo) ``` -```{r ouija-pseudotime} +```{r pseudotime22} map_pst <- map_pseudotime(oui_vb) ouija_pseudotime <- data.frame(map_pst, cell_classifications) @@ -469,7 +469,7 @@ Ouija does quite well in the ordering of the cells here, although it can be sens Ouija identifies four metastable states here, which we might annotate as "zygote/2cell", "4/8/16 cell", "blast1" and "blast2". -```{r ouija-states} +```{r pseudotime23} ggplot(as.data.frame(colData(deng_SCE)), aes(x = as.factor(ouija_cell_class), y = pseudotime_ouija, colour = cell_type2)) + @@ -483,7 +483,7 @@ ggplot(as.data.frame(colData(deng_SCE)), A common analysis is to work out the regulation orderings of genes. For example, is gene A upregulated before gene B? Does gene C peak before the downregulation of gene D? Ouija answers these questions in terms of a Bayesian hypothesis test of whether the difference in regulation timing (either switch time or peak time) is significantly different to 0. This is collated using the gene_regulation function. -```{r ouija-regulation} +```{r pseudotime24} gene_regs <- gene_regulation(oui_vb) head(gene_regs) @@ -500,7 +500,7 @@ How do the trajectories inferred by TSCAN, Monocle, Diffusion Map, SLICER and Ou TSCAN and Diffusion Map methods get the trajectory the "wrong way round", so we'll adjust that for these comparisons. -```{r compare-results, fig.width=10} +```{r pseudotime25, fig.width=10} df_pseudotime <- as.data.frame( colData(deng_SCE)[, grep("pseudotime", colnames(colData(deng_SCE)))] ) @@ -535,28 +535,28 @@ SLICER that do not provide plotting functions. 
__Principal components__ -```{r Rhoa-pc1, message=FALSE} +```{r pseudotime26, message=FALSE} plotExpression(deng_SCE, "Rhoa", x = "PC1", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) ``` __TSCAN__ -```{r Rhoa-tscan, message=FALSE, warning=FALSE} +```{r pseudotime27, message=FALSE, warning=FALSE} plotExpression(deng_SCE, "Rhoa", x = "pseudotime_order_tscan", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) ``` __Monocle__ -```{r Rhoa-monocle, message=FALSE} +```{r pseudotime28, message=FALSE} plotExpression(deng_SCE, "Rhoa", x = "pseudotime_monocle", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) ``` __Diffusion Map__ -```{r Rhoa-diff-map, message=FALSE} +```{r pseudotime29, message=FALSE} plotExpression(deng_SCE, "Rhoa", x = "pseudotime_diffusionmap", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) @@ -564,14 +564,14 @@ plotExpression(deng_SCE, "Rhoa", x = "pseudotime_diffusionmap", __SLICER__ -```{r Rhoa-slicer, message=FALSE} +```{r pseudotime30, message=FALSE} plotExpression(deng_SCE, "Rhoa", x = "pseudotime_slicer", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) ``` __Ouija__ -```{r Rhoa-ouija, message=FALSE} +```{r pseudotime31, message=FALSE} plotExpression(deng_SCE, "Rhoa", x = "pseudotime_ouija", colour_by = "cell_type2", show_violin = FALSE, show_smooth = TRUE) @@ -583,6 +583,6 @@ __Exercise 7__: Repeat the exercise using a subset of the genes, e.g. the set of ### sessionInfo() -```{r echo=FALSE} +```{r pseudotime32, echo=FALSE} sessionInfo() ``` diff --git a/course_files/remove-conf-reads.Rmd b/course_files/remove-conf-reads.Rmd index c7ea74062..13a681ab9 100644 --- a/course_files/remove-conf-reads.Rmd +++ b/course_files/remove-conf-reads.Rmd @@ -4,12 +4,12 @@ output: html_document ## Dealing with confounders (Reads) -```{r, echo=FALSE} +```{r remove-conf-read0, echo=FALSE} library(knitr) -opts_chunk$set(fig.align = 'center') +opts_chunk$set(cache = TRUE, fig.align = 'center') ``` -```{r, message=FALSE, warning=FALSE} +```{r remove-conf-read1, message=FALSE, warning=FALSE} library(scRNA.seq.funcs) library(RUVSeq) library(scater) @@ -31,7 +31,7 @@ reads.qc <- computeSumFactors(reads.qc, sizes = 15, clusters = qclust) reads.qc <- normalize(reads.qc) ``` -```{r, message=FALSE, warning=FALSE} +```{r remove-conf-read2, message=FALSE, warning=FALSE} ruvg <- RUVg(counts(reads.qc), erccs, k = 1) assay(reads.qc, "ruvg1") <- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 @@ -42,7 +42,7 @@ assay(reads.qc, "ruvg10") <- log2( ) ``` -```{r} +```{r remove-conf-read3} scIdx <- matrix(-1, ncol = max(table(reads.qc$individual)), nrow = 3) tmp <- which(reads.qc$individual == "NA19098") scIdx[1, 1:length(tmp)] <- tmp @@ -61,7 +61,7 @@ assay(reads.qc, "ruvs10") <- log2( ) ``` -```{r, eval = TRUE, message=FALSE, warning=FALSE} +```{r remove-conf-read4, eval = TRUE, message=FALSE, warning=FALSE} combat_data <- logcounts(reads.qc) mod_data <- as.data.frame(t(combat_data)) # Basic batch removal @@ -81,7 +81,7 @@ assay(reads.qc, "combat") <- ComBat( __Exercise 1__ -```{r, eval = TRUE, echo = FALSE, message=FALSE, warning=FALSE} +```{r remove-conf-read5, eval = TRUE, echo = FALSE, message=FALSE, warning=FALSE} assay(reads.qc, "combat_tf") <- ComBat( dat = t(mod_data), batch = factor(reads.qc$batch), @@ -91,7 +91,7 @@ assay(reads.qc, "combat_tf") <- ComBat( ) ``` -```{r} +```{r remove-conf-read6} do_mnn <- function(data.qc) { batch1 <- logcounts(data.qc[, data.qc$replicate == 
"r1"]) batch2 <- logcounts(data.qc[, data.qc$replicate == "r2"]) @@ -144,7 +144,7 @@ assay(reads.qc, "mnn") <- cbind(indi1, indi2, indi3) #) ``` -```{r} +```{r remove-conf-read7} glm_fun <- function(g, batch, indi) { model <- glm(g ~ batch + indi) model$coef[1] <- 0 # replace intercept with 0 to preserve reference batch. @@ -163,7 +163,7 @@ assay(reads.qc, "glm") <- corrected __Exercise 2__ -```{r, echo=FALSE} +```{r remove-conf-read8, echo=FALSE} glm_fun1 <- function(g, batch) { model <- glm(g ~ batch) model$coef[1] <- 0 # replace intercept with 0 to preserve reference batch. @@ -187,7 +187,7 @@ indi3 <- do_glm(reads.qc[, reads.qc$individual == "NA19239"]) assay(reads.qc, "glm_indi") <- cbind(indi1, indi2, indi3); ``` -```{r} +```{r remove-conf-read9} reads.qc.endog = reads.qc[endog_genes,] reads.qc.endog = runPCA(reads.qc.endog, exprs_values = 'logcounts', ncomponents = 20) pca <- as.matrix(reads.qc.endog@reducedDims@listData[["PCA"]]) @@ -203,7 +203,7 @@ plotReducedDim( ) ``` -```{r} +```{r remove-conf-read10} for(n in assayNames(reads.qc)) { tmp <- runPCA( reads.qc[endog_genes, ], @@ -221,7 +221,7 @@ for(n in assayNames(reads.qc)) { } ``` -```{r} +```{r remove-conf-read11} res <- list() for(n in assayNames(reads.qc)) { res[[n]] <- suppressWarnings(calc_cell_RLE(assay(reads.qc, n), erccs)) @@ -230,7 +230,7 @@ par(mar=c(6,4,1,1)) boxplot(res, las=2) ``` -```{r, message = FALSE} +```{r remove-conf-read12, message = FALSE} compare_kBET_results <- function(sce){ indiv <- unique(sce$individual) norms <- assayNames(sce) # Get all normalizations @@ -253,7 +253,7 @@ compare_kBET_results <- function(sce){ eff_debatching <- compare_kBET_results(reads.qc) ``` -```{r, message = FALSE} +```{r remove-conf-read13, message = FALSE} require("reshape2") require("RColorBrewer") # Plot results @@ -283,6 +283,6 @@ ggplot(dod, aes(Normalisation, Individual, fill=kBET)) + ggtitle("Effect of batch regression methods per individual") ``` -```{r echo=FALSE} +```{r remove-conf-read14, echo=FALSE} sessionInfo() ``` diff --git a/course_files/remove-conf.Rmd b/course_files/remove-conf.Rmd index 86060f03c..d5f575030 100644 --- a/course_files/remove-conf.Rmd +++ b/course_files/remove-conf.Rmd @@ -18,12 +18,12 @@ Given the issues with using spike-ins, better results can often be obtained by u We explore both general approaches below. -```{r, echo=FALSE} +```{r remove-conf0, echo=FALSE} library(knitr) -opts_chunk$set(fig.align = 'center') +opts_chunk$set(cache= TRUE, fig.align = 'center') ``` -```{r, message=FALSE, warning=FALSE} +```{r remove-conf1, message=FALSE, warning=FALSE} library(scRNA.seq.funcs) library(RUVSeq) library(scater) @@ -66,7 +66,7 @@ We will concentrate on the first two approaches. #### RUVg -```{r, message=FALSE, warning=FALSE} +```{r remove-conf2, message=FALSE, warning=FALSE} ruvg <- RUVg(counts(umi.qc), erccs, k = 1) assay(umi.qc, "ruvg1") <- log2( t(t(ruvg$normalizedCounts) / colSums(ruvg$normalizedCounts) * 1e6) + 1 @@ -79,7 +79,7 @@ assay(umi.qc, "ruvg10") <- log2( #### RUVs -```{r} +```{r remove-conf3} scIdx <- matrix(-1, ncol = max(table(umi.qc$individual)), nrow = 3) tmp <- which(umi.qc$individual == "NA19098") scIdx[1, 1:length(tmp)] <- tmp @@ -101,7 +101,7 @@ assay(umi.qc, "ruvs10") <- log2( ### Combat If you have an experiment with a balanced design, `Combat` can be used to eliminate batch effects while preserving biological effects by specifying the biological effects using the `mod` parameter. 
However, the `Tung` data contains multiple experimental replicates rather than a balanced design, so using `mod1` to preserve biological variability will result in an error.

-```{r message=FALSE, warning=FALSE, paged.print=FALSE}
+```{r remove-conf4, message=FALSE, warning=FALSE, paged.print=FALSE}
combat_data <- logcounts(umi.qc)
mod_data <- as.data.frame(t(combat_data))
# Basic batch removal
@@ -123,7 +123,7 @@ __Exercise 1__

Perform `ComBat` correction accounting for total features as a co-variate. Store the corrected matrix in the `combat_tf` slot.

-```{r eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE}
+```{r remove-conf5, eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE}
assay(umi.qc, "combat_tf") <- ComBat(
    dat = t(mod_data), 
    batch = factor(umi.qc$batch), 
@@ -137,7 +137,7 @@ assay(umi.qc, "combat_tf") <- ComBat(

`mnnCorrect` [@Haghverdi2017-vh] assumes that each batch shares at least one biological condition with each other batch. Thus it works well for a variety of balanced experimental designs. However, the `Tung` data contains multiple replicates for each individual rather than balanced batches, thus we will normalize each individual separately. Note that this will remove batch effects between batches within the same individual but not the batch effects between batches in different individuals, due to the confounded experimental design. Thus we will merge a replicate from each individual to form three batches.

-```{r}
+```{r remove-conf6}
do_mnn <- function(data.qc) {
    batch1 <- logcounts(data.qc[, data.qc$replicate == "r1"])
    batch2 <- logcounts(data.qc[, data.qc$replicate == "r2"])
@@ -193,7 +193,7 @@ assay(umi.qc, "mnn") <- cbind(indi1, indi2, indi3)

### GLM

A general linear model is a simpler version of `Combat`. It can correct for batches while preserving biological effects if you have a balanced design. In a confounded/replicate design biological effects will not be fit/preserved. Similar to `mnnCorrect` we could remove batch effects from each individual separately in order to preserve biological (and technical) variance between individuals. For demonstration purposes we will naively correct all confounded batch effects:

-```{r}
+```{r remove-conf7}
glm_fun <- function(g, batch, indi) {
    model <- glm(g ~ batch + indi)
    model$coef[1] <- 0 # replace intercept with 0 to preserve reference batch.
@@ -214,7 +214,7 @@ __Exercise 2__

Perform GLM correction for each individual separately. Store the final corrected matrix in the `glm_indi` slot.

-```{r, echo=FALSE}
+```{r remove-conf8, echo=FALSE}
glm_fun1 <- function(g, batch) {
    model <- glm(g ~ batch)
    model$coef[1] <- 0 # replace intercept with 0 to preserve reference batch.
@@ -244,7 +244,7 @@ Harmony [Korsunsky2018fast] is a newer batch correction method, which is designe

Because the end result of Harmony is an altered dimensionality-reduction (PCA) space, we plot the obtained manifold here and exclude Harmony from the rest of the comparisons in this section.

-```{r}
+```{r remove-conf9}
umi.qc.endog = umi.qc[endog_genes,]
umi.qc.endog = runPCA(umi.qc.endog, exprs_values = 'logcounts', ncomponents = 20)
pca <- as.matrix(umi.qc.endog@reducedDims@listData[["PCA"]])
@@ -272,7 +272,7 @@ corresponds to different biological samples (individuals). Separation of biologi
interspersed batches indicates that technical variation has been removed. We always use log2-cpm normalized data to match the assumptions of PCA.
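For reference, the log2-cpm transform used for these comparisons is the same one applied to the RUV assays earlier in the chapter, wrapped here as a small helper (the name `log2_cpm` is ours, for illustration only):

```r
# Counts-per-million on a genes-by-cells count matrix, log2-transformed
# with a pseudocount of 1, exactly as in the RUVg/RUVs assays above.
log2_cpm <- function(counts) {
    log2(t(t(counts) / colSums(counts) * 1e6) + 1)
}
# e.g. assay(umi.qc, "logcpm") <- log2_cpm(counts(umi.qc))
```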
-```{r} +```{r remove-conf10} for(n in assayNames(umi.qc)) { tmp <- runPCA( umi.qc[endog_genes, ], @@ -298,7 +298,7 @@ Consider different `ks` for RUV normalizations. Which gives the best results? We can also examine the effectiveness of correction using the relative log expression (RLE) across cells to confirm technical noise has been removed from the dataset. Note RLE only evaluates whether the number of genes higher and lower than average are equal for each cell - i.e. systemic technical effects. Random technical noise between batches may not be detected by RLE. -```{r} +```{r remove-conf11} res <- list() for(n in assayNames(umi.qc)) { res[[n]] <- suppressWarnings(calc_cell_RLE(assay(umi.qc, n), erccs)) @@ -313,7 +313,7 @@ Another method to check the efficacy of batch-effect correction is to consider t `kBET` [@Buttner2017-ds] takes `kNN` networks around random cells and tests the number of cells from each batch against a binomial distribution. The rejection rate of these tests indicates the severity of batch-effects still present in the data (high rejection rate = strong batch effects). `kBET` assumes each batch contains the same complement of biological groups, thus it can only be applied to the entire dataset if a perfectly balanced design has been used. However, `kBET` can also be applied to replicate-data if it is applied to each biological group separately. In the case of the Tung data, we will apply `kBET` to each individual independently to check for residual batch effects. However, this method will not identify residual batch-effects which are confounded with biological conditions. In addition, `kBET` does not determine if biological signal has been preserved. -```{r, message = FALSE} +```{r remove-conf12, message = FALSE} compare_kBET_results <- function(sce){ indiv <- unique(sce$individual) norms <- assayNames(sce) # Get all normalizations @@ -336,7 +336,7 @@ compare_kBET_results <- function(sce){ eff_debatching <- compare_kBET_results(umi.qc) ``` -```{r, message = FALSE} +```{r remove-conf13, message = FALSE} require("reshape2") require("RColorBrewer") # Plot results @@ -376,7 +376,7 @@ Perform the same analysis with read counts of the `tung` data. Use `tung/reads.r ### sessionInfo() -```{r echo=FALSE} +```{r remove-conf14, echo=FALSE} sessionInfo() ``` diff --git a/course_files/scater.Rmd b/course_files/scater.Rmd index b9e65ae4b..dac709cad 100644 --- a/course_files/scater.Rmd +++ b/course_files/scater.Rmd @@ -6,7 +6,7 @@ output: html_document ```{r scater0, echo=FALSE} library(knitr) -opts_chunk$set(fig.align = "center", cache=TRUE, cache.extra = list(R.version, sessionInfo())) +opts_chunk$set(cache = TRUE, fig.align = "center") set.seed(1) ``` diff --git a/course_files/search.Rmd b/course_files/search.Rmd index 6a9e1de49..92deee002 100644 --- a/course_files/search.Rmd +++ b/course_files/search.Rmd @@ -4,12 +4,12 @@ output: html_document ## Search scRNA-Seq data -```{r echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE} +```{r Search0, echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE} library(knitr) -opts_chunk$set(fig.align="center") +opts_chunk$set(cache=TRUE, fig.align="center") ``` -```{r, echo=TRUE, message=FALSE, warning=FALSE} +```{r Search1, echo=TRUE, message=FALSE, warning=FALSE} library(scfind) library(SingleCellExperiment) library(plotly) @@ -20,14 +20,14 @@ set.seed(1234567) `scfind` is a tool that allows one to search single cell RNA-Seq collections (Atlas) using lists of genes, e.g. 
searching for cells and cell-types where a specific set of genes are expressed. `scfind` is a [Github package](https://github.com/hemberg-lab/scfind). Cloud implementation of `scfind` with a large collection of datasets is available on our [website](https://scfind.sanger.ac.uk/). -```{r, echo = FALSE, out.width = '80%', fig.cap="scfind can be used to search large collection of scRNA-seq data by a list of gene IDs."} +```{r Search2, echo = FALSE, out.width = '80%', fig.cap="scfind can be used to search large collection of scRNA-seq data by a list of gene IDs."} knitr::include_graphics("figures/scfind.png") ``` ### Dataset We will run `scfind` on the Tabula Muris 10X dataset. `scfind` also operates on `SingleCellExperiment` class: -```{r, message=FALSE, warning=FALSE} +```{r Search3, message=FALSE, warning=FALSE} tm10x_heart <- readRDS("data/sce/Heart_10X.rds") tm10x_heart colData(tm10x_heart) @@ -36,7 +36,7 @@ colData(tm10x_heart) ### Gene Index Now we need to create a gene index using our dataset: -```{r} +```{r Search4} heart_index <- buildCellTypeIndex( sce = tm10x_heart, assay.name = "counts", @@ -51,7 +51,7 @@ The input matrix for indexing is the raw count matrix of the `SingleCellExperime The index can be saved in .rds format using `saveObject` function and loaded using `loadObject` function for future use. -```{r} +```{r Search5} tm10x_thymus <- readRDS("data/sce/Thymus_10X.rds") thymus_index <- buildCellTypeIndex( sce = tm10x_thymus, @@ -60,7 +60,6 @@ thymus_index <- buildCellTypeIndex( dataset.name = "Thymus" ) scfind_index <- mergeDataset(heart_index, thymus_index) - scfind_index@datasets cellTypeNames(scfind_index) sample(scfindGenes(scfind_index),20) @@ -68,7 +67,7 @@ sample(scfindGenes(scfind_index),20) To quickly and easily find the enriched cell type using an interactive Shiny application use the following method: -```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE} +```{r Search6, eval=FALSE, message=FALSE, warning=FALSE, include=FALSE} scfindShiny(scfind_index) ``` @@ -76,15 +75,14 @@ scfindShiny(scfind_index) ### Marker genes Now let's find the marker genes for Thymus T cell in the datasets -```{r} +```{r Search7} # Showing the top 5 marker genes for each cell type and sort by F1 score. t_cell_markers <- cellTypeMarkers(scfind_index, cell.types = "Thymus.T cell", top.k = 5, sort.field = "f1") - t_cell_markers ``` Next, you can evaluate the markers of Thymus T cell in Thymus stromal cell -```{r} +```{r Search8} evaluateMarkers( scfind_index, gene.list = as.character(t_cell_markers$genes), @@ -93,12 +91,10 @@ evaluateMarkers( ) ``` -```{r} +```{r Search9} # By default, the marker evaluation takes all cell types in the dataset as background cell type, but you can use the argument `background.cell.types` to fine tune the evaluation - background <- cellTypeNames(scfind_index, datasets = "Thymus") background - evaluateMarkers( scfind_index, gene.list = as.character(t_cell_markers$genes), @@ -112,7 +108,7 @@ evaluateMarkers( ### Search cells by a gene list `scfind` can instantly identify the cell type that best represents the genes of interest from large single cell dataset. We will use the marker genes identified in an original publication [Yanbin et al. 2015](https://www.nature.com/articles/cr201599). Cardiomyocyte-specific markers used in immunostaining as shown in Figure 1. 
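Before running the search, a brief aside on the F1 score that `cellTypeMarkers` uses for ranking above: it is the harmonic mean of marker precision and recall. A toy calculation (the numbers are invented; this sketches the idea, not scfind's implementation):

```r
tp <- 90                                  # target-type cells expressing the marker
fp <- 10                                  # cells of other types expressing it
fn <- 20                                  # target-type cells missing it
precision <- tp / (tp + fp)
recall    <- tp / (tp + fn)
f1 <- 2 * precision * recall / (precision + recall)
f1                                        # markers are ranked by this score
```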
-```{r}
+```{r Search10}
cardiomyocytes <- c("Mef2c", "Gata4", "Nkx2.5", "Myh6", "tnnt2", "tnni3", "CDH2", "Cx43", "GJA1")
result <- markerGenes(
    scfind_index,
@@ -124,7 +120,7 @@ result

To allow search of enriched cell types from a long gene query, `scfind` features a query optimization routine. First, the function `markerGenes` suggests subqueries with the highest support in the dataset. The TF-IDF score for each gene set allows the user to identify the best subquery for finding the most relevant cell type.

-```{r}
+```{r Search11}
best_subquery <- result[which.max(result$tfidf),] # get the best subquery by ranking TF-IDF score
best_subquery <- strsplit(as.character(best_subquery$Query), ",")[[1]] # obtain gene list
hyperQueryCellTypes(
@@ -140,9 +136,8 @@ __Exercise 1__

Find the marker genes of all cell types in the Heart dataset.

-```{r, echo=FALSE}
+```{r Search12, echo=FALSE}
heart_cell_types <- cellTypeNames(scfind_index, "Heart")
-
for(cell_type_name in heart_cell_types)
{
    result <- cellTypeMarkers(
@@ -154,7 +149,7 @@ for(cell_type_name in heart_cell_types)
}
```

-```{r}
+```{r Search13}
cardiac_contractility <- c("Ace2","Fkbp1b","Gh","Cacna1c","Cd59b","Ppp1r1a","Tnnt2","Nos1","Agtr1a","Camk2g","Grk2","Ins2","Dnah8","Igf1","Nos3","Nppa","Nppb","Il6","Myh6","Ren2","Tnni3","Apln","Kcnmb1","Pik3cg","Prkca","Aplnr","Slc8a1","Ace","Akt1","Edn1","Kcnmb2","Nos2","Tnf","Myh14","Adrb2","Agt","Adrb1","Atp2a2","Ryr2","Pln")
```

__Exercise 2__

Input the gene list relevant to "cardiac contractility" and find the best gene set with the highest support. Identify the enriched cell type for this query.

-```{r, echo=FALSE}
+```{r Search14, echo=FALSE}
result <- markerGenes(scfind_index, cardiac_contractility)
best_sub_query <- as.character(result[which.max(result$tfidf), c("Query")])
best_sub_query <- strsplit(best_sub_query, ",")[[1]]
@@ -174,37 +169,31 @@ hyperQueryCellTypes(scfind_index, best_sub_query)

Using the `findCellTypes` function, you can perform in-silico gating to identify cell-type subsets, much as cell sorting does. To do so, you can prefix gene names with logical operators: "-" for "no expression" and "*" for "intermediate expression". Here, we use operators to subset the T cells of the Thymus dataset into effector T regulatory cells and effector memory T cells.
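To make the operator semantics concrete before calling `findCellTypes`, here is a toy emulation of such a gate on a plain expression matrix (the values and thresholds are invented; scfind evaluates gates on its own index, not on a matrix like this):

```r
expr <- matrix(c(5, 0, 2,
                 0, 1, 4),
               nrow = 2, byrow = TRUE,
               dimnames = list(c("cellA", "cellB"), c("Il2ra", "Ptprc", "Il7r")))
gate <- expr[, "Il2ra"] == 0 &                    # "-Il2ra": no expression
    expr[, "Ptprc"] > 0 & expr[, "Ptprc"] < 3 &   # "*Ptprc": intermediate
    expr[, "Il7r"] > 0                            # "Il7r": expressed
rownames(expr)[gate]                              # cells passing the gate
```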
-```{r} +```{r Search15} effector_t_reg_cells <- c("*Ptprc", "-Il7r", "Ctla4", "-Il7r") effector_memory_t_cells <- c("-Il2ra", "*Ptprc", "Il7r") - subset_treg <- findCellTypes(scfind_index, effector_t_reg_cells, "Thymus") subset_tmem <- findCellTypes(scfind_index, effector_memory_t_cells, "Thymus") - subset_treg subset_tmem ``` Let's use the TSNE plot information from the `SingleCellExperiment` of Thymus to illustrate the gating result -```{r} +```{r Search16} map <- data.frame( tm10x_thymus@reducedDims[['TSNE']], cell_type = as.character(colData(tm10x_thymus)$cell_type1), stringsAsFactors = F ) - map <- subset(map, cell_type == "T cell") - plot_ly(map, x = ~X1 , y = ~X2, type="scatter") - map$cell_type[subset_treg$`Thymus.T cell`] <- "Effector T Regulatory Cell" map$cell_type[subset_tmem$`Thymus.T cell`] <- "Effector Memory T Cell" - plot_ly(map, x = ~X1 , y = ~X2, type="scatter", color = ~cell_type) ``` ### sessionInfo() -```{r} +```{r Search17} sessionInfo() ``` diff --git a/course_files/seurat.Rmd b/course_files/seurat.Rmd index c8eff66a5..feb391621 100644 --- a/course_files/seurat.Rmd +++ b/course_files/seurat.Rmd @@ -2,12 +2,12 @@ output: html_document --- -```{r, echo=FALSE} +```{r seurat0, echo=FALSE} library(knitr) -opts_chunk$set(fig.align = "center") +opts_chunk$set(cache = TRUE, fig.align = "center") ``` -```{r, echo=TRUE, message=FALSE, warning=FALSE} +```{r seurat1, echo=TRUE, message=FALSE, warning=FALSE} set.seed(1234567) ``` @@ -25,7 +25,7 @@ We will be analyzing the a dataset of Peripheral Blood Mononuclear Cells (PBMC) We start by reading in the data. All features in Seurat have been configured to work with sparse matrices which results in significant memory and speed savings for Drop-seq/inDrop/10x data. -```{r} +```{r seurat2} library(Seurat) library(dplyr) library(cowplot) @@ -57,7 +57,7 @@ The steps below encompass the standard pre-processing workflow for scRNA-seq dat While the `CreateSeuratObject` imposes a basic minimum gene-cutoff, you may want to filter out cells at this stage based on technical or biological parameters. Seurat allows you to easily explore QC metrics and filter cells based on any user-defined criteria. In the example below, we visualize gene and molecule counts, plot their relationship, and exclude cells with a clear outlier number of genes detected as potential multiplets. Of course this is not a guaranteed method to exclude cell doublets, but we include this as an example of filtering user-defined outlier cells. We also filter cells based on the percentage of mitochondrial genes present. -```{r} +```{r seurat3} # The number of genes and UMIs (nGene and nUMI) are automatically calculated # for every object by Seurat. For non-UMI data, nUMI represents the sum of # the non-normalized values within a cell We calculate the percentage of @@ -80,7 +80,7 @@ pbmc$percent.mito <- percent.mito VlnPlot(object = pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3) ``` -```{r} +```{r seurat4} # GenePlot is typically used to visualize gene-gene relationships, but can # be used for anything calculated by the object, i.e. columns in # object@meta.data, PC scores etc. 
Since there is a rare subset of cells @@ -91,7 +91,7 @@ FeatureScatter(object = pbmc, feature1 = "nCount_RNA", feature2 = "percent.mito" FeatureScatter(object = pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") ``` -```{r} +```{r seurat5} # We filter out cells that have unique gene counts (nFeature_RNA) over 2,500 or less than # 200 Note that > and < are used to define a'gate'. #-Inf and Inf should be used if you don't want a lower or upper threshold. @@ -102,7 +102,7 @@ pbmc <- subset(x = pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & per After removing unwanted cells from the dataset, the next step is to normalize the data. By default, we employ a global-scaling normalization method “LogNormalize” that normalizes the gene expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result. -```{r} +```{r seurat6} pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scale.factor = 10000) ``` @@ -110,12 +110,12 @@ pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scal Seurat calculates highly variable genes and focuses on these for downstream analysis. `FindVariableGenes` calculates the average expression and dispersion for each gene, places these genes into bins, and then calculates a z-score for dispersion within each bin. This helps control for the relationship between variability and average expression. This function is unchanged from (Macosko et al.), but new methods for variable gene expression identification are coming soon. We suggest that users set these parameters to mark visual outliers on the dispersion plot, but the exact parameter settings may vary based on the data type, heterogeneity in the sample, and normalization strategy. The parameters here identify ~2,000 variable genes, and represent typical parameter settings for UMI data that is normalized to a total of 1e4 molecules. -```{r} +```{r seurat7} pbmc <- FindVariableFeatures(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5, nfeatures = 2000) ``` To view the output of the FindVariableFeatures output we use this function. The genes appear not to be stored in the object, but can be accessed this way. -```{r} +```{r seurat8} head(x = HVFInfo(object = pbmc)) ``` @@ -127,7 +127,7 @@ We can regress out cell-cell variation in gene expression driven by batch (if ap Seurat v2.0 implements this regression as part of the data scaling process. Therefore, the `RegressOut` function has been deprecated, and replaced with the vars.to.regress argument in `ScaleData`. -```{r} +```{r seurat9} pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nCounts_RNA", "percent.mito")) ``` @@ -137,7 +137,7 @@ pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nCounts_RNA", "percent.mit --> refered to Seurat v3 (latest): high variable features are accessed through the function HVFInfo(object). Despite RunPCA has a features argument where to specify the features to compute PCA on, I've been modifying its values and the output PCA graph has always the same dimensions, indicating that the provided genes in the features argument are not exactly the ones used to compute PCA. Wether the function gets the HVG directly or does not take them into account, I don't know. 
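Before moving on to PCA, it is worth pausing on what `ScaleData`'s `vars.to.regress` did above: conceptually, each gene is regressed on the nuisance covariates and the residuals are kept. A minimal sketch with simulated data (all variable names invented):

```r
set.seed(1)
nUMI <- runif(100, 1e3, 1e4)              # simulated per-cell sequencing depth
expr <- 0.001 * nUMI + rnorm(100)         # a gene whose signal tracks depth
corrected <- residuals(lm(expr ~ nUMI))   # depth effect regressed out
round(cor(corrected, nUMI), 3)            # ~0 after correction
```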
-```{r}
+```{r seurat10}
pbmc <- RunPCA(object = pbmc, npcs = 30, verbose = FALSE)
```
@@ -153,38 +153,38 @@ Seurat v3 provides functions for visualizing:
- Violin and Ridge plots
- Heatmaps

-```{r}
+```{r seurat11}
# Examine and visualize PCA results a few different ways
DimPlot(object = pbmc, reduction = "pca")
```

-```{r}
+```{r seurat12}
# Dimensional reduction plot, with cells colored by a quantitative feature
FeaturePlot(object = pbmc, features = "MS4A1")
```

-```{r}
+```{r seurat13}
# Scatter plot across single cells, replaces GenePlot
FeatureScatter(object = pbmc, feature1 = "MS4A1", feature2 = "PC_1")
FeatureScatter(object = pbmc, feature1 = "MS4A1", feature2 = "CD3D")
```

-```{r}
+```{r seurat14}
# Scatter plot across individual features, replaces CellPlot
CellScatter(object = pbmc, cell1 = "AGTCTACTAGGGTG", cell2 = "CACAGATGGTTTCT")
```

-```{r}
+```{r seurat15}
VariableFeaturePlot(object = pbmc)
```

-```{r}
+```{r seurat16}
# Violin and Ridge plots
VlnPlot(object = pbmc, features = c("LYZ", "CCL5", "IL32"))
RidgePlot(object = pbmc, feature = c("LYZ", "CCL5", "IL32"))
```

In particular `DimHeatmap` allows for easy exploration of the primary sources of heterogeneity in a dataset, and can be useful when trying to decide which PCs to include for further downstream analyses. Both cells and genes are ordered according to their PCA scores. Setting cells.use to a number plots the ‘extreme’ cells on both ends of the spectrum, which dramatically speeds plotting for large datasets. Though clearly a supervised analysis, we find this to be a valuable tool for exploring correlated gene sets.

-```{r}
+```{r seurat17}
# Heatmaps
DimHeatmap(object = pbmc, reduction = "pca", cells = 200, balanced = TRUE)
```

@@ -197,7 +197,7 @@ To overcome the extensive technical noise in any single gene for scRNA-seq data,

In [Macosko et al](http://www.cell.com/abstract/S0092-8674(15)00549-8), we implemented a resampling test inspired by the jackStraw procedure. We randomly permute a subset of the data (1% by default) and rerun PCA, constructing a ‘null distribution’ of gene scores, and repeat this procedure. We identify ‘significant’ PCs as those that have a strong enrichment of low p-value genes.

-```{r}
+```{r seurat18}
# NOTE: This process can take a long time for big datasets, comment out for
# expediency. More approximate techniques such as those implemented in
# PCElbowPlot() can be used to reduce computation time
pbmc <- JackStraw(object = pbmc, reduction = "pca", dims = 20, num.replicate = 100)
```

The `JackStrawPlot` function provides a visualization tool for comparing the distribution of p-values for each PC with a uniform distribution (dashed line). ‘Significant’ PCs will show a strong enrichment of genes with low p-values (solid curve above the dashed line). In this case it appears that PCs 1-10 are significant.

-```{r}
+```{r seurat19}
pbmc <- ScoreJackStraw(object = pbmc, dims = 1:20, reduction = "pca")
```

-```{r}
+```{r seurat20}
JackStrawPlot(object = pbmc, dims = 1:20, reduction = "pca")
```

A more ad hoc method for determining which PCs to use is to look at a plot of the standard deviations of the principal components and draw your cutoff where there is a clear elbow in the graph. This can be done with `ElbowPlot`. In this example, it looks like the elbow would fall around PC 5.
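The elbow heuristic is easy to reproduce on simulated data: embed a low-rank signal in noise and the per-PC variance drops off sharply once the informative components are exhausted. A self-contained base-R sketch (the dimensions and rank are arbitrary):

```r
set.seed(1)
# 200 cells x 50 genes with a rank-5 signal plus noise
signal <- matrix(rnorm(200 * 5), ncol = 5) %*% matrix(rnorm(5 * 50), nrow = 5)
mat <- signal + matrix(rnorm(200 * 50, sd = 0.5), ncol = 50)
pcs <- prcomp(mat)
plot(pcs$sdev[1:20]^2 / sum(pcs$sdev^2), type = "b",
     xlab = "PC", ylab = "Proportion of variance")  # elbow after ~5 PCs
```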
-```{r}
+```{r seurat21}
ElbowPlot(object = pbmc)
```

@@ -231,7 +231,7 @@ Latest clustering results will be stored in object metadata under `seurat_cluste

First calculate k-nearest neighbors and construct the SNN graph (`FindNeighbors`), then run `FindClusters`.

-```{r}
+```{r seurat22}
pbmc <- FindNeighbors(pbmc, reduction = "pca", dims = 1:20)
pbmc <- FindClusters(pbmc, resolution = 0.5, algorithm = 1)
```
@@ -240,7 +240,7 @@ pbmc <- FindClusters(pbmc, resolution = 0.5, algorithm = 1)

Seurat continues to use tSNE as a powerful tool to visualize and explore these datasets. While we no longer advise clustering directly on tSNE components, cells within the graph-based clusters determined above should co-localize on the tSNE plot. This is because the tSNE aims to place cells with similar local neighborhoods in high-dimensional space together in low-dimensional space. As input to the tSNE, we suggest using the same PCs as input to the clustering analysis, although computing the tSNE based on scaled gene expression is also supported using the genes.use argument.

-```{r}
+```{r seurat23}
pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)
# note that you can set do.label=T to help label individual clusters
DimPlot(object = pbmc, reduction = "tsne")
@@ -249,7 +249,7 @@ DimPlot(object = pbmc, reduction = "tsne")

## Run UMAP

To visualize the two conditions side-by-side, we can use the split.by argument to show each condition colored by cluster.

-```{r}
+```{r seurat24}
pbmc <- RunUMAP(pbmc, reduction = "pca", dims = 1:20)
DimPlot(pbmc, reduction = "umap", split.by = "seurat_clusters")
```
@@ -257,7 +257,7 @@ DimPlot(pbmc, reduction = "umap", split.by = "seurat_clusters")

You can save the object at this point so that it can easily be loaded back in without having to rerun the computationally intensive steps performed above, or easily shared with collaborators.

-```{r}
+```{r seurat25}
saveRDS(pbmc, file = "data/pbmc_tutorial.rds")
```

@@ -267,19 +267,19 @@ Seurat can help you find markers that define clusters via differential expressio

The `min.pct` argument requires a gene to be detected at a minimum percentage in either of the two groups of cells, and the thresh.test argument requires a gene to be differentially expressed (on average) by some amount between the two groups. You can set both of these to 0, but with a dramatic increase in time - since this will test a large number of genes that are unlikely to be highly discriminatory. As another option to speed up these computations, `max.cells.per.ident` can be set. This will downsample each identity class to have no more cells than whatever this is set to. While there is generally going to be a loss in power, the speed increases can be significant and the most highly differentially expressed genes will likely still rise to the top.
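By default `FindMarkers` applies a Wilcoxon rank-sum test to each gene between the two groups of cells; a one-gene sketch on simulated values shows the shape of the computation (means and group sizes are invented):

```r
set.seed(1)
cluster_a <- rnorm(50, mean = 2)   # expression of one gene in cluster A
cluster_b <- rnorm(50, mean = 0)   # the same gene in cluster B
wilcox.test(cluster_a, cluster_b)$p.value
```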
-```{r} +```{r seurat26} # find all markers of cluster 1 cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25) print(x = head(x = cluster1.markers, n = 5)) ``` -```{r} +```{r seurat27} # find all markers distinguishing cluster 5 from clusters 0 and 3 cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 2, ident.2 = c(0, 3), min.pct = 0.25) print(x = head(x = cluster5.markers, n = 5)) ``` -```{r} +```{r seurat28} # find markers for every cluster compared to all remaining cells, report # only the positive ones pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25) @@ -288,7 +288,7 @@ pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC) Seurat has several tests for differential expression which can be set with the test.use parameter (see our [DE vignette](http://satijalab.org/seurat/de_vignette.html) for details). For example, the ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect). -```{r} +```{r seurat29} cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 0, thresh.use = 0.25, test.use = "roc", only.pos = TRUE) ``` @@ -300,18 +300,18 @@ We also suggest exploring: • `CellPlot`, and • `DotPlot` as additional methods to view your dataset. -```{r} +```{r seurat30} VlnPlot(object = pbmc, features =c("NKG7", "PF4")) ``` -```{r} +```{r seurat31} FeaturePlot(object = pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"), cols = c("grey", "blue"), reduction = "tsne") ``` `DoHeatmap` generates an expression heatmap for given cells and genes. In this case, we are plotting the top 20 markers (or all markers if less than 20) for each cluster. -```{r} +```{r seurat32} top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC) # setting slim.col.label to TRUE will print just the cluster IDS instead of # every cell name @@ -322,7 +322,7 @@ DoHeatmap(object = pbmc, features = top10$gene, label = TRUE) Fortunately in the case of this dataset, we can use canonical markers to easily match the unbiased clustering to known cell types. -```{r} +```{r seurat33} current.cluster.ids <- c(0, 1, 2, 3, 4, 5, 6, 7) new.cluster.ids <- c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes") pbmc@active.ident <- plyr::mapvalues(x = pbmc@active.ident, from = current.cluster.ids, to = new.cluster.ids) @@ -333,7 +333,7 @@ DimPlot(object = pbmc, reduction = "tsne", do.label = TRUE, pt.size = 0.5) If you perturb some of our parameter choices above (for example, setting `resolution=0.8` or changing the number of PCs), you might see the CD4 T cells subdivide into two groups. You can explore this subdivision to find markers separating the two T cell subsets. However, before reclustering (which will overwrite `object@ident`), we can stash our renamed identities to be easily recovered later. 
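As an aside before stashing, the `plyr::mapvalues` relabeling above can also be expressed with a plain named vector, which makes the old-to-new mapping explicit (toy labels below, shown only for illustration):

```r
id_map <- c("0" = "CD4 T cells", "1" = "CD14+ Monocytes", "2" = "B cells")
old_ids <- c(0, 1, 2, 0, 1)
unname(id_map[as.character(old_ids)])
```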
-```{r} +```{r seurat34} # First lets stash our identities for later pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6") @@ -342,7 +342,7 @@ pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6") pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, resolution = 0.8, print.output = FALSE) ``` -```{r} +```{r seurat35} # Demonstration of how to plot two tSNE plots side by side, and how to color # points based on different criteria plot1 <- DimPlot(object = pbmc, reduction = "tsne", do.return = TRUE, no.legend = TRUE, do.label = TRUE) @@ -350,7 +350,7 @@ plot2 <- DimPlot(object = pbmc, reduction = "tsne", do.return = TRUE, group.by = plot_grid(plot1, plot2) ``` -```{r} +```{r seurat36} # Find discriminating markers tcell.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = 1) @@ -363,13 +363,13 @@ FeaturePlot(object = pbmc, features = c("S100A4", "CCR7"), cols = c("green", "bl The memory/naive split is bit weak, and we would probably benefit from looking at more cells to see if this becomes more convincing. In the meantime, we can restore our old cluster identities for downstream processing. -```{r} +```{r seurat37} pbmc <- SetIdent(object = pbmc, value = "ClusterNames_0.6") saveRDS(pbmc, file = "data/pbmc3k_final.rds") ``` ## sessionInfo() -```{r echo=FALSE} +```{r seurat38, echo=FALSE} sessionInfo() ``` diff --git a/course_files/umis.Rmd b/course_files/umis.Rmd index 70343ce93..8a5b9b9d6 100644 --- a/course_files/umis.Rmd +++ b/course_files/umis.Rmd @@ -2,9 +2,9 @@ output: html_document --- -```{r umis0, echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())} +```{r Umis0, echo=FALSE} library(knitr) -opts_chunk$set(fig.align = "center", echo=FALSE, cache=TRUE, cache.extra = list(R.version, sessionInfo())) +opts_chunk$set(cache = TRUE, fig.align = "center", echo=FALSE) ``` ## Unique Molecular Identifiers (UMIs) {#umichapter} @@ -15,7 +15,7 @@ Thanks to Andreas Buness from [EMBL Monterotondo](https://www.embl.it/services/b Unique Molecular Identifiers are short (4-10bp) random barcodes added to transcripts during reverse-transcription. They enable sequencing reads to be assigned to individual transcript molecules and thus the removal of amplification noise and biases from scRNASeq data. -```{r umis1, out.width = '90%', fig.cap="UMI sequencing protocol"} +```{r Umis1, out.width = '90%', fig.cap="UMI sequencing protocol"} knitr::include_graphics("figures/UMI-Seq-protocol.png") ``` @@ -34,7 +34,7 @@ After processing the reads from a UMI experiment, the following conventions are 2. Reads are sorted into separate files by cell barcode + For extremely large, shallow datasets, the cell barcode may be added to the read name as well to reduce the number of files. -```{r umis2, out.width = '90%', fig.cap="UMI sequencing reads, red lightning bolts represent different fragmentation locations"} +```{r Umis2, out.width = '90%', fig.cap="UMI sequencing reads, red lightning bolts represent different fragmentation locations"} knitr::include_graphics("figures/UMI-Seq-reads.png") ``` @@ -51,7 +51,7 @@ In theory, every unique UMI-transcript pair should represent all reads originati 3. __Same UMI does not necessarily mean same molecule__ + Biases in UMI frequency and short UMIs can result in the same UMI being attached to different mRNA molecules from the same gene. Thus, the number of transcripts may be underestimated. 
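Error 3 (UMI collisions) is easy to quantify with the standard occupancy formula: with N possible barcodes and M molecules, the expected number of distinct UMIs observed is N(1 - (1 - 1/N)^M), which saturates below the true molecule count. A quick numerical illustration (the 6bp pool size is an arbitrary example within the 4-10bp range above):

```r
N <- 4^6                          # number of possible 6bp UMIs
M <- c(100, 1000, 5000, 20000)    # true molecule counts
expected_unique <- N * (1 - (1 - 1/N)^M)
round(expected_unique)            # observed UMIs fall short as M grows
```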
-```{r umis3, out.width = '90%', fig.cap="Potential Errors in UMIs"} +```{r Umis3, out.width = '90%', fig.cap="Potential Errors in UMIs"} knitr::include_graphics("figures/UMI-Seq-errors.png") ``` @@ -70,7 +70,7 @@ where N = total number of unique UMI barcodes and n = number of observed barcode An important caveat of this method is that it assumes that all UMIs are equally frequent. In most cases this is incorrect, since there is often a bias related to the GC content. -```{r umis4, out.width = '60%', fig.cap="Per gene amplification rate"} +```{r Umis4, out.width = '60%', fig.cap="Per gene amplification rate"} knitr::include_graphics("figures/UMI-Seq-amp.png") ``` @@ -85,7 +85,7 @@ Determining how to best process and use UMIs is currently an active area of rese Current UMI platforms (DropSeq, InDrop, ICell8) exhibit low and highly variable capture efficiency as shown in the figure below. -```{r umis5, out.width = '70%', fig.cap="Variability in Capture Efficiency"} +```{r Umis5, out.width = '70%', fig.cap="Variability in Capture Efficiency"} knitr::include_graphics("figures/UMI-Seq-capture.png") ``` @@ -93,7 +93,7 @@ This variability can introduce strong biases and it needs to be considered in do __Exercise 1__ We have provided you with UMI counts and read counts from induced pluripotent stem cells generated from three different individuals [@Tung2017-ba] (see: Chapter \@ref(exprs-qc) for details of this dataset). -```{r umis6, echo=TRUE, include=TRUE} +```{r Umis6, echo=TRUE, include=TRUE} umi_counts <- read.table("data/tung/molecules.txt", sep = "\t") read_counts <- read.table("data/tung/reads.txt", sep = "\t") ``` @@ -103,7 +103,7 @@ Using this data: 2. Determine the amplification rate: average number of reads per UMI. -```{r umis7, include=FALSE} +```{r Umis7, include=FALSE} # Exercise 1 # Part 1 plot(colSums(umi_counts), colSums(umi_counts > 0), xlab="Total Molecules Detected", ylab="Total Genes Detected") diff --git a/main.nf b/main.nf index 2d0307344..575689fca 100644 --- a/main.nf +++ b/main.nf @@ -1,33 +1,54 @@ - Channel .fromPath("$baseDir/course_files", type: 'dir') - .into { ch_course_files1; ch_course_files2 } + .set { ch_course_files } + + +Channel + .fromPath('s3://scrnaseq-course/data/', checkIfExists: false) + .set { ch_data } Channel - .fromPath('s3://scrnaseq-course/data', checkIfExists: false) - .into { ch_data1; ch_data2 } + .fromPath('s3://scrnaseq-course/_bookdown_files/', checkIfExists: false) + .set { ch_cached_files } process html { - //publishDir 's3://scrnaseq-course', mode: 'copy' , overwrite: true + + echo true + input: - file fs from ch_course_files1 - file dat from ch_data1 + file 'course_dir_work/data' from ch_data + file 'course_dir' from ch_course_files + path '_bookdown_files' from ch_cached_files + output: - file website - script: - """ - cp -r $fs/* . + file 'course_dir_work/website' + + shell: + ''' + cp -r course_dir/* course_dir_work + cd course_dir_work + ln -s ../_bookdown_files . Rscript -e "bookdown::render_book('index.html', 'bookdown::gitbook')" - """ + ''' } +/* + process latex { - input: - file fs from ch_course_files2 - file dat from ch_data2 - script: - """ - cp -r $fs/* . - Rscript -e "bookdown::render_book('index.html', 'bookdown::pdf_book')" - """ + + echo true + + input: + file 'course_dir_work/data' from ch_data2 + file 'course_dir' from ch_course_files2 + path '_bookdown_files' from ch_cached_files + shell:: + ''' + cp -r course_dir/* course_dir_work + cd course_dir_work + ln -s ../_bookdown_files . 
+    Rscript -e "bookdown::render_book('index.html', 'bookdown::pdf_book')"
+    '''
 }
+
+*/
diff --git a/nextflow.config b/nextflow.config
index 975899ddb..122d51627 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -21,6 +21,7 @@ profiles {
         includeConfig 'conf/lsf.config'
         includeConfig 'conf/images.config'
         includeConfig 'conf/singularity.config'
+        includeConfig 'conf/sanger-singularity.config'
     }
 }
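A closing note on the caching theme of this patch: `cache = TRUE` only pays off when chunk labels are stable, because knitr keys each cache entry by the chunk label plus a hash of the chunk's options and code. The sketch below restates the convention adopted throughout the chapters above (the rationale in the comments is ours):

```r
# Unlabeled chunks get positional labels (unnamed-chunk-1, unnamed-chunk-2, ...)
# that shift whenever a chunk is added or removed, silently invalidating
# downstream caches. Stable, unique labels such as "remove-conf1" avoid this,
# and cache.extra invalidates caches when the R version or session changes.
library(knitr)
opts_chunk$set(cache = TRUE, cache.extra = list(R.version, sessionInfo()))
```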