-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcondense_and_summarise_fastas.R
129 lines (96 loc) · 3.86 KB
/
condense_and_summarise_fastas.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#### redirect output and messages/errors to the log ####
args <- commandArgs(trailingOnly = TRUE)
#### get all the snakemake variables ####
maf2fasta_output <- args[1]
final_fasta <- args[2]
pair_summary_file <- args[3]
mask_fasta <- args[4]
# we need the target sequence name for a few things.
# because lastz munges names with dots in them, we just use the output
# file name, which has the {tchrom} on it.
tchrom = args[5]
#### Do the rest ####
library(tidyverse)
# read the sequences in. They are on every other line. The first
# is the target, the following are separate ones for each query chromosome
seqs <- read_lines(maf2fasta_output)
# break those sequences into a list of vectors
seq_vec_list <- str_split(seqs[seq(2, length(seqs), by = 2)], pattern = "")
snames <- seqs[seq(1, length(seqs), by = 2)]
rm(seqs)
# if there were multiple query sequences, condense them into a single one.
# the "-" has the lowest value of any of the possible letters in the aligned
# sequences, so this step takes the base at the aligned query sequence, if there
# is an aligned query sequency. (Remember we did single_cov2 so there will be only
# one aligned query at any point.)
if(length(seq_vec_list) > 2) {
anc <- do.call(pmax, seq_vec_list[-1])
} else {
anc <- seq_vec_list[[2]]
}
# now, we count up the number of different types of sites
pair_counts <- table(paste(seq_vec_list[[1]], anc))
# and make a tibble of those numbers
count_summary <- tibble(
name = names(pair_counts),
n = as.integer(pair_counts)
) %>%
separate(name, into = c("target", "ancestral"), sep = " ") %>%
mutate(chrom = tchrom, .before = target)
# and write that file out
write_csv(count_summary, file = pair_summary_file)
# and now, from anc, we subset out the sites that are "-"s in the target.
# That gives us an ancestral sequence that is congruent with the
# original target sequence. And then we replace the "-"'s in the ancestral
# seq with Ns
anc_fasta_seq <- anc[seq_vec_list[[1]] != "-"]
anc_fasta_seq[anc_fasta_seq == "-"] <- "N"
# We also create a fasta that masks the sites that are considered
# to be in repeat regions in either the target or the ancestral
# genome. This is done by marking any position that is not a
# capital letter A, C, G, or T
# in both sequences an N, and anything that is a capital lettter
# A, C, G, or T in both a P.
# Note that this also chucks sites that are designated by IUPAC codes
# in the query. Hmmm....
targ_seq <- seq_vec_list[[1]]
targ_seq <- targ_seq[targ_seq != "-"]
mask_seq = ifelse(
(targ_seq %in% c("A", "C", "G", "T")) & (anc_fasta_seq %in% c("A", "C", "G", "T")),
"P",
"N"
)
# now make a matrix out of those (anc_fasta_seq and mask_seq) to print them in lines.
# We have to extend each with NAs to be a length that is a multiple of 70 so that
# it doesn't get chopped off when we make a matrix of it for fast printing.
final_line_bits <- length(anc_fasta_seq) %% 70
if(final_line_bits != 0) {
length(anc_fasta_seq) <- length(anc_fasta_seq) + (70 - final_line_bits)
length(mask_seq) <- length(mask_seq) + (70 - final_line_bits)
}
# now, make a matrix of those and write them out. We ensure that
# the NA's in the last row of the matrix are just empty (i.e. ""'s).
# writing out the ancestral fasta
cat(">", tchrom, "\n", sep = "", file = final_fasta)
matrix(anc_fasta_seq, ncol = 70, byrow = TRUE) %>%
write.table(
file = final_fasta,
sep = "",
na = "",
quote = FALSE,
append = TRUE,
row.names = FALSE,
col.names = FALSE
)
# writing out the mask fasta
cat(">", tchrom, "\n", sep = "", file = mask_fasta)
matrix(mask_seq, ncol = 70, byrow = TRUE) %>%
write.table(
file = mask_fasta,
sep = "",
na = "",
quote = FALSE,
append = TRUE,
row.names = FALSE,
col.names = FALSE
)