-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalyse_cnv.R
137 lines (107 loc) · 3.76 KB
/
analyse_cnv.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# @DEPI rna_integrated_monocle.rds
# @DEPI metadata.rds
# @DEPO infercnv_output
library(monocle3)
library(infercnv)
library(testthat)
library(biomaRt)
library(tidyverse)
library(fs)
# Parameters --------------------------------------------------------------
# folder with input files
root_dir <- "data_generated"
# folder where infercnv results are saved
out_dir <- path_join(c(root_dir, "infercnv_output"))
# posterior probabilities for filtering normal regions
post_probs <- c(0.5, 0.4, 0.3, 0.2, 0.1)
# Load data ---------------------------------------------------------------
nb <- readRDS(path_join(c(root_dir, "rna_integrated_monocle.rds")))
nb_data <- readRDS(path_join(c(root_dir, "metadata.rds")))
# Prepare input data ------------------------------------------------------
# annotation of cells as reference or malignant
annotation_data <-
nb_data %>%
arrange(group, sample) %>%
transmute(
cell = cell,
cell_type = case_when(
cluster_50 == "8" ~ str_glue("malignant_{sample}_{group}"),
TRUE ~ as.character(cellont_name)
)
) %>%
column_to_rownames("cell")
expect_length(
intersect(colnames(counts(nb)), rownames(annotation_data)),
ncol(counts(nb))
)
# reference names
ref_group_names <-
annotation_data %>%
filter(!cell_type %>% str_starts("malignant")) %>%
pull(cell_type) %>%
unique()
# order of genes on chromosome
ensembl <- useEnsembl(
biomart = "genes",
dataset = "hsapiens_gene_ensembl",
version = 103
)
gene_order <-
getBM(
attributes = c(
"hgnc_symbol", "chromosome_name", "start_position", "end_position"
),
filters = "hgnc_symbol",
values = rownames(counts(nb)),
mart = ensembl
) %>%
as_tibble() %>%
filter(str_detect(chromosome_name, "^(\\d\\d?|X|Y)$")) %>%
mutate(chromosome_name = chromosome_name %>% as_factor() %>% fct_inseq()) %>%
arrange(chromosome_name, start_position) %>%
mutate(chromosome_name = str_glue("chr{chromosome_name}")) %>%
distinct(hgnc_symbol, .keep_all = TRUE) %>%
column_to_rownames("hgnc_symbol")
# Run infercnv ------------------------------------------------------------
infercnv_obj <- CreateInfercnvObject(
raw_counts_matrix = counts(nb), # |
annotations_file = annotation_data, # | inputs calculated above
ref_group_names = ref_group_names, # |
gene_order_file = gene_order, # |
chr_exclude = NULL, # genes have already been filtered
# max_cells_per_group = 50 # uncomment for testing
)
infercnv_obj <- infercnv::run(
infercnv_obj,
out_dir = out_dir,
cutoff = 0.1, # recommended for 10x data
cluster_by_groups = TRUE, # cluster observations by sample
denoise = TRUE, # denoise results
HMM = TRUE, # run HMM to predict CNV level
BayesMaxPNormal = post_probs[1], # use the first element of post_probs
no_plot = TRUE # disable (slow!) plotting
)
# Filter CNVs at different posterior probabilities ------------------------
# load data
mcmc_obj <- readRDS(
path_join(c(out_dir, "18_HMM_pred.Bayes_NetHMMi6.hmm_mode-samples.mcmc_obj"))
)
mcmc_obj@args$out_dir <- path_join(
c(out_dir, "BayesNetOutput.HMMi6.hmm_mode-samples")
)
infercnv_obj <- readRDS(
path_join(c(out_dir, "17_HMM_predHMMi6.hmm_mode-samples.infercnv_obj"))
)
# perform filtering for the remaining probability levels
walk(
post_probs[-1],
function(prob) {
res <- filterHighPNormals(mcmc_obj, [email protected], prob)
infercnv:::adjust_genes_regions_report(
res[[1]],
input_filename_prefix = "17_HMM_predHMMi6.hmm_mode-samples",
output_filename_prefix =
str_glue("HMM_CNV_predictions.HMMi6.hmm_mode-samples.Pnorm_{prob}"),
out_dir = out_dir)
}
)