-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathextract_tRNA.py
executable file
·33 lines (27 loc) · 1.05 KB
/
extract_tRNA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env python
"""
Extract tRNA coordinates from GTF
"""
import sys
import GTF
import numpy as np
import pandas as pd
def main(GENCODE):
    """Write tables of tRNA transcripts and tRNA genes found in a GTF file.

    The annotation is loaded with ``GTF.dataframe``; every ``.<digits>`` run
    in ``gene_id`` (presumably the GENCODE version suffix) is removed, and two
    tab-separated files without header or index are written to the current
    working directory:

    * ``tRNA_transcripts.bed``: rows with ``feature == "transcript"`` whose
      ``transcript_type`` contains ``"tRNA"``; the name column is
      ``transcript_id``.
    * ``tRNA_genes.bed``: rows with ``feature == "gene"`` whose ``gene_type``
      contains ``"tRNA"``; the name column is ``gene_id``.

    Both files have the columns seqname, start, end, name, gene_name, strand
    and are sorted by seqname, start, end.

    Args:
        GENCODE: Path of the GTF file; passed unchanged to ``GTF.dataframe``.
    """
    gc = GTF.dataframe(GENCODE)
    gc["gene_id"] = gc["gene_id"].replace(
        to_replace=r"\.[0-9]+", value="", regex=True
    )

    _write_tRNA_table(
        gc, "transcript", "transcript_type", "transcript_id",
        "tRNA_transcripts.bed",
    )
    _write_tRNA_table(gc, "gene", "gene_type", "gene_id", "tRNA_genes.bed")


def _write_tRNA_table(gc, feature, type_column, id_column, path):
    """Write the tRNA rows of one feature level of ``gc`` to ``path``."""
    # na=False: rows that lack the type attribute count as non-matches
    # instead of putting NaN into the boolean mask.
    is_tRNA = gc[type_column].str.contains("tRNA", na=False)
    selected = (gc["feature"] == feature) & is_tRNA

    # .loc replaces DataFrame.ix, which no longer exists in pandas >= 1.0.
    # The explicit copy keeps the column assignments below from acting on a
    # view of ``gc`` (chained-assignment warning).
    columns = ["seqname", "start", "end", id_column, "gene_name", "strand"]
    tRNA = gc.loc[selected, columns].copy()

    # Cast to int so the sort is numeric and coordinates print as integers.
    # NOTE(review): coordinates are written exactly as stored in the GTF; no
    # conversion to 0-based BED starts is performed -- confirm that
    # downstream consumers expect this.
    tRNA["start"] = tRNA["start"].astype(int)
    tRNA["end"] = tRNA["end"].astype(int)

    tRNA = tRNA.sort_values(by=["seqname", "start", "end"])
    tRNA.to_csv(path, sep="\t", header=False, index=False)
if __name__ == "__main__":
    # Exit with a usage message rather than an IndexError traceback when the
    # GTF path is missing. Only the first argument is used; extras are ignored.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <annotation.gtf>".format(sys.argv[0]))
    main(sys.argv[1])