-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmerge_counts.R
87 lines (65 loc) · 2.07 KB
/
merge_counts.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/RScript
# htseq-combine.R
# copy this code to RStudio and adapt file locations to match yours
# Take 'gtf' htseq-count results and melt them in to one big dataframe
# where are we?
basedir <- "/work/Projects-BITS/PJ1502_WP_comparing-mappers/GIB-counts"
setwd(basedir)
cntdir <- paste(basedir, "htseq_counts", sep="/")
pat <- "-counts.txt"
htseq-counts <- list.files(path = cntdir,
pattern = pat,
gtf.files = TRUE,
recursive = FALSE,
ignore.case = FALSE,
include.dirs = FALSE)
myfiles <- htseq-counts
DT <- list()
# read each file as array element of DT and rename the last 2 cols
# we created a list of single sample tables
for (i in 1:length(myfiles) ) {
infile = paste(cntdir, myfiles[i], sep = "/")
DT[[myfiles[i]]] <- read.table(infile, header = F, stringsAsFactors = FALSE)
cnts <- gsub("(.*)_gtf_counts.txt", "\\1", myfiles[i])
colnames(DT[[myfiles[i]]]) <- c("ID", cnts)
}
# merge gtf elements based on first ID columns
data <- DT[[myfiles[1]]]
# inspect
head(data)
# we now add each other table with the ID column as key
for (i in 2:length(myfiles)) {
y <- DT[[myfiles[i]]]
z <- merge(data, y, by = c("ID"))
data <- z
}
# ID column becomes rownames
rownames(data) <- data$ID
data <- data[,-1]
## add total counts per sample
data <- rbind(data, tot.counts=colSums(data))
# remove no-count rows
data <- data[data$tot.counts>0,]
# inspect and look at the top row names!
head(data)
tail(data)
####################################
# take summary rows to a new table
# ( not starting with ENS with invert=TRUE )
# transpose table for readability
data.summary <- data[grep("^ENS", rownames(data), perl=TRUE, invert=TRUE), ]
# review
data.summary
# transpose table
t(data.summary)
# write summary to file
write.csv(data.summary, file = "htseq_counts-summary.csv")
####################################
# take gtf data rows to a new table
data.table <- data[grep("^ENS", rownames(data), perl=TRUE, invert=FALSE), ]
# final merged table
head(data.table, 3)
# write data to file
write.csv(data.table, file = "htseq_counts.csv")
# cleanup intermediate objects
rm(y, z, i, DT)