-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsingleCellTMM.R
70 lines (65 loc) · 4.65 KB
/
singleCellTMM.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#' Calculate TMM normalization factors
#' @param x: a matrix of TPM values, for which normalization factors are calculated
#' @param normFactors: a vector of normalization factors to divide by if provided, otherwise runs calcTMMFactors
#' @param ...: other parameters to be passed to calcTMMFactors
#' @return the TMM normalized data
#' @examples
#' allTMM = normalizeTMM(allTPM, verbose=0);
normalizeTMM = function(x, normFactors=NULL, ...){
if (is.null(normFactors)){
normFactors = calcTMMFactors(x, ...);
}
for (c in 1: ncol(x)){
x[,c] = x[,c]/normFactors[c];
}
return(x);
}
#' Calculate TMM normalization factors
#' @param x: a matrix of TPM values, for which normalization factors are calculated
#' @param trimByExpr: the fraction of data points to remove from the lowly expressed and highly expressed (respectively) genes
#' @param trimByRatio: the fraction of data points to remove from either side of the log ratios of the cell/reference
#' @param referenceSample: the vector of gene expression values that are used as a reference sample. Defaults to the median of non-0 expression values
#' @param zeroExpression: the expression value is considered 0
#' @param verbose: verbosity of messaging; will report every nth cell for verbose>0
#' @return the normalization factors that each sample should be divided by
#' @examples
#' normFactors = calcTMMFactors(tpmMatrix);
calcTMMFactors = function(x, trimByExpr = c(0.3,0.1), trimByRatio = c(0.25,0.25), referenceSample=NULL, zeroExpression=0, verbose=0){
#x is a matrix of TPMs (genes x samples)
#get the median expression of each gene
if (is.null(referenceSample)){ # take the reference sample as the median of non-zero expression values in the data (TPM usually)
referenceSample = apply(x,1,function(y){median(y[y>zeroExpression], na.rm=T)});
referenceSample[is.na(referenceSample)]=0;
}
#for each cell, take only those genes that 1) are expressed 2) are somewhere in the "middle" of expression values and 3) have intermediate ratios
keepReference = referenceSample>zeroExpression; # non-zero
referenceBounds = quantile(referenceSample[keepReference], c(trimByExpr[1], 1-trimByExpr[2]), na.rm=T); #trim extremes of expression
keepReference = keepReference & referenceSample>=referenceBounds[1] & referenceSample<=referenceBounds[2];
#p = ggplot(data.frame(refExpr = c(referenceSample[referenceSample>zeroExpression],referenceSample[keepReference]),class=c(rep("all",sum(referenceSample>zeroExpression)), rep("used",sum(keepReference)))),aes(x=log2(refExpr+1),fill=class)) +geom_histogram(alpha=0.25, position="identity", bins=100) + theme_bw()+geom_vline(xintercept = log2(1+referenceBounds)); print(p)
if (verbose>0){
message(sprintf("Keeping %f%% of reference expression values.",100*mean(keepReference)))
}
normFactors = rep(1, ncol(x));
for (c in 1: ncol(x)){
keepCurCell = x[,c]>zeroExpression; #non zero for this cell
curExprBounds = quantile(x[keepCurCell,c], c(trimByExpr[1], 1-trimByExpr[2]), na.rm=T); #define extremes of expression
keepCurCell = keepCurCell & x[,c]>=curExprBounds[1] & x[,c]<=curExprBounds[2];
#p = ggplot(data.frame(cellExpr = c(x[x[,c]>zeroExpression,c],x[keepCurCell,c]),class=c(rep("all",sum(x[,c]>zeroExpression)), rep("used",sum(keepCurCell)))),aes(x=log2(cellExpr+1),fill=class)) +geom_histogram(alpha=0.25, position="identity", bins=100) + theme_bw()+geom_vline(xintercept = log2(1+curExprBounds)); print(p)
curLogRatios = log2(x[,c]/referenceSample); # get current log ratios for all datapoints
curRatioBounds = quantile(curLogRatios[keepCurCell & keepReference], c(trimByRatio[1], 1-trimByRatio[2]), na.rm=T); #define extremes of fold change
keepCurCell = keepCurCell & keepReference & curLogRatios>=curRatioBounds[1] & curLogRatios<=curRatioBounds[2];
keepCurCell[is.na(keepCurCell)]=F;
curOffset = mean(curLogRatios[keepCurCell], na.rm=T) # calculate the offset of this sample with the outlier-free data
#p = ggplot(data.frame(cellOverRef = c(curLogRatios,curLogRatios[keepCurCell]),class=c(rep("all",length(curLogRatios)), rep("used",sum(keepCurCell)))),aes(x=cellOverRef,fill=class)) +geom_histogram(alpha=0.25, position="identity", bins=100) + theme_bw()+geom_vline(xintercept = curRatioBounds); print(p)
if (verbose>0 && c %% verbose ==0){
message(sprintf("Cell %i is being centered by %i genes and is offset from the reference by %g (log2)",c,sum(keepCurCell), curOffset))
}
if (sum(keepCurCell)<50){
message(sprintf("WARNING: fewer than 50 genes used to center sample %i (#genes = %i)",c,sum(keepCurCell)));
}
if (sum(keepCurCell)>0){
normFactors[c] = (2^curOffset);
}
}
return(normFactors);
}