From ca0d2521ae3a07f8aef977b5f91c06923e58a2b1 Mon Sep 17 00:00:00 2001 From: DongzeHe Date: Fri, 6 Jan 2023 16:51:22 -0500 Subject: [PATCH 1/6] feat: add make spliceu txome function, and fix biopython Strand warning --- bin/pyroe | 75 ++- setup.cfg | 2 +- src/pyroe/__init__.py | 4 +- src/pyroe/make_txome.py | 1027 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 1104 insertions(+), 4 deletions(-) mode change 100644 => 100755 bin/pyroe create mode 100644 src/pyroe/make_txome.py diff --git a/bin/pyroe b/bin/pyroe old mode 100644 new mode 100755 index ab105ae..c4a4296 --- a/bin/pyroe +++ b/bin/pyroe @@ -2,7 +2,7 @@ import logging -from pyroe import make_splici_txome +from pyroe import make_splici_txome, make_spliceu_txome from pyroe import fetch_processed_quant from pyroe import convert from pyroe import id_to_name @@ -22,12 +22,15 @@ if __name__ == "__main__": parser.add_argument( "-v", "--version", action="version", version=f"pyroe {__version__}" ) + subparsers = parser.add_subparsers( title="subcommands", dest="command", description="valid subcommands", help="additional help", ) + + # make-splici parser_makeSplici = subparsers.add_parser( "make-splici", help="Make splici reference" ) @@ -101,6 +104,63 @@ if __name__ == "__main__": help="A flag indicates whether a clean gtf will be written if encountered invalid records.", ) + # make-spliceu + parser_makeSpliceu = subparsers.add_parser( + "make-spliceu", help="Make spliceu reference" + ) + parser_makeSpliceu.add_argument( + "genome_path", + metavar="genome-path", + type=str, + help="The path to a genome fasta file.", + ) + parser_makeSpliceu.add_argument( + "gtf_path", metavar="gtf-path", type=str, help="The path to a gtf file." + ) + parser_makeSpliceu.add_argument( + "output_dir", + metavar="output-dir", + type=str, + help="The output directory where Spliceu reference files will be written.", + ) + parser_makeSpliceu.add_argument( + "--filename-prefix", + type=str, + default="spliceu", + help="The file name prefix of the generated output files.", + ) + parser_makeSpliceu.add_argument( + "--extra-spliced", + type=str, + help="The path to an extra spliced sequence fasta file.", + ) + parser_makeSpliceu.add_argument( + "--extra-unspliced", + type=str, + help="The path to an extra unspliced sequence fasta file.", + ) + parser_makeSpliceu.add_argument( + "--bt-path", + type=str, + default="bedtools", + help="The path to bedtools v2.30.0 or greater.", + ) + parser_makeSpliceu.add_argument( + "--no-bt", + action="store_true", + help="A flag indicates whether bedtools will be used for generating Spliceu reference files.", + ) + parser_makeSpliceu.add_argument( + "--dedup-seqs", + action="store_true", + help="A flag indicates whether identical sequences will be deduplicated.", + ) + parser_makeSpliceu.add_argument( + "--write-clean-gtf", + action="store_true", + help="A flag indicates whether a clean gtf will be written if encountered invalid records.", + ) + # parse available datasets available_datasets = fetch_processed_quant() epilog = "\n".join( @@ -212,6 +272,19 @@ if __name__ == "__main__": no_flanking_merge=args.no_flanking_merge, write_clean_gtf=args.write_clean_gtf, ) + elif args.command == "make-spliceu": + make_spliceu_txome( + genome_path=args.genome_path, + gtf_path=args.gtf_path, + output_dir=args.output_dir, + filename_prefix=args.filename_prefix, + extra_spliced=args.extra_spliced, + extra_unspliced=args.extra_unspliced, + dedup_seqs=args.dedup_seqs, + no_bt=args.no_bt, + bt_path=args.bt_path, + write_clean_gtf=args.write_clean_gtf, + ) elif args.command == "fetch-quant": fetch_processed_quant( dataset_ids=args.dataset_ids, diff --git a/setup.cfg b/setup.cfg index 6155b22..8c8bee3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = pyroe -version = 0.6.4 +version = 0.7.0 author = Dongze He, Rob Patro author_email = dhe17@umd.edu, rob@cs.umd.edu description = utilities of alevin-fry diff --git a/src/pyroe/__init__.py b/src/pyroe/__init__.py index 89ce72b..3a7b831 100644 --- a/src/pyroe/__init__.py +++ b/src/pyroe/__init__.py @@ -1,7 +1,7 @@ -__version__ = "0.6.4" +__version__ = "0.7.0" from pyroe.load_fry import load_fry -from pyroe.make_splici_txome import make_splici_txome +from pyroe.make_txome import make_splici_txome, make_spliceu_txome from pyroe.fetch_processed_quant import fetch_processed_quant from pyroe.load_processed_quant import load_processed_quant from pyroe.ProcessedQuant import ProcessedQuant diff --git a/src/pyroe/make_txome.py b/src/pyroe/make_txome.py new file mode 100644 index 0000000..58dac41 --- /dev/null +++ b/src/pyroe/make_txome.py @@ -0,0 +1,1027 @@ +def append_extra(extra_infile, out_fa, out_t2g3col, id2name_path, col_status): + """ + extra_infile : str + path to the extra FASTA format input file from which to read the extra records + out_fa : str + path to the output FASTA file being created (will be *appended* to) + out_t2g3col : str + path to the output t2g 3-column file (will be *appended* to) + id2name_path : str + path to the output (existing) gene id to gene name tsv (will be *appended* to) + col_status : char + splicing status to use for these records (one of 'S' or 'U') + """ + + from Bio.SeqIO.FastaIO import SimpleFastaParser + + # write extra sequences to the t2g file + with open(extra_infile) as extra_in_fa: + with open(out_fa, "a") as splici_fa: + with open(out_t2g3col, "a") as t2g: + with open(id2name_path, "a") as id_to_name: + for title, sequence in SimpleFastaParser(extra_in_fa): + tid = title.split()[0] + splici_fa.write(f">{tid}\n") + splici_fa.write(f"{sequence}\n") + t2g.write(f"{tid}\t{tid}\t{col_status}\n") + id_to_name.write(f"{tid}\t{tid}\n") + + +def dedup_sequences(output_dir, in_fa, out_fa): + """ + This function deduplicates the fasta sequences in `in_fa`, writes the + deduplicated sequences in fasta format to `out_fa`, and also produces a + 2-column tsv file in `ouput_dir`/duplicate_entries.tsv recording the + entries that have been removed and the retained entry that they match. + Note: It is allowed (and in some cases intended) that `in_fa` == `out_fa`. + In this case, the input fasta file will be overwritten. + + output_dir : str + The output directory where duplicate_entries.tsv will be written + in_fa : str + The path to the input fasta file to be deduplicated + out_fa : str + The path to where the ouput deduplicated fasta file should be written + """ + import os + from Bio import SeqIO + from Bio.Seq import Seq + from Bio.SeqRecord import SeqRecord + + record_representatives = {} + + # read from the input fasta file and track + # the duplicate sequences + for record in SeqIO.parse(in_fa, "fasta"): + seq = str(record.seq) + name = record.id.split()[0] + # if we haven't seen it yet, this is the representative + # otherwise append the name of the duplicate + if seq in record_representatives: + record_representatives[seq].append(name) + else: + record_representatives[seq] = [name] + + # write out the duplicate entries in the same format + # the salmon / pufferfish indexer uses + dup_file_name = os.path.sep.join([output_dir, "duplicate_entries.tsv"]) + with open(dup_file_name, "w") as dup_file: + dup_file.write("RetainedRef\tDuplicateRef\n") + for k, vu in record_representatives.items(): + if len(vu) > 1: + v = sorted(vu) + for dup in v[1:]: + dup_file.write(f"{v[0]}\t{dup}\n") + + # write the deduplicated output to file + # Note: it might be that out_file == in_file and we + # are overwriting the input. + with open(out_fa, "w") as ofile: + for k, vu in record_representatives.items(): + v = sorted(vu) + rec = SeqRecord(Seq(k), id=v[0], description="") + SeqIO.write(rec, ofile, "fasta") + + +def check_gr(gr, output_dir, write_clean_gtf): + """ + This function checks the validity of a PyRanges object. + It will follow these rules: + 1. For gene type records, gene_id and gene_name cannot be both missing. + 2. All other types of records have to have a valid (not NaN) transcript_id. + 3. For all types of records other than gene type, + - If the gene_id and gene_name are both missing, then the transcript_id + will be used to impute them. + - If one of gene_id and gene_name is missing, then the other will be + used to impute the missing one. + + Args: + gr (pyranges): Stranded PyRanges object + """ + + import pandas as pd + import os + import pyranges as pr + import warnings + + # split gene type records with others + # we don't use gene records in splici construction + gene_gr = gr[gr.Feature == "gene"] + gr = gr[gr.Feature != "gene"] + clean_gtf_path = os.path.join(output_dir, "clean_gtf.gtf") + + # If required fields are missing, quit + if "transcript_id" not in gr.columns: + raise ValueError( + "The input GTF file doesn't contain transcript_id field; Cannot proceed." + ) + + if "gene_id" not in gr.columns: + # use gene_name as gene_id if exists, return an error otherwise + if "gene_name" not in gr.columns: + raise ValueError( + "The input GTF file doesn't contain gene_id and gene_name field; Cannot proceed." + ) + else: + warnings.warn("gene_id field does not exist, use gene_name as gene_id.") + gene_id = pd.Series(data=gr.gene_name, name="gene_id") + gr = gr.insert(gene_id) + + if "gene_name" not in gr.columns: + warnings.warn("gene_name field does not exist, use gene_id as gene_name.") + gene_name = pd.Series(data=gr.gene_id, name="gene_name") + gr = gr.insert(gene_name) + + # If there is any NaN in transcript_id field, quit + if sum(gr.transcript_id.isnull()): + + # first, write an clean GTF if needed + if write_clean_gtf: + gr = pr.concat([gene_gr, gr[gr.transcript_id.notnull()]]) + gr.to_gtf(clean_gtf_path) + # gr[gr.transcript_id.notnull()].to_gtf(clean_gtf_path) + clean_gtf_msg = f"An clean GTF file is written to {clean_gtf_path}." + else: + clean_gtf_msg = "Set the write_clean_gtf flag if a clean GTF without the invalid records is needed." + # Then, raise a value error + raise ValueError( + f"Found NaN value in exons' transcript ID; Cannot proceed.\n{clean_gtf_msg}" + ) + + # Impute missing gene_id and gene_name values + # define an object + num_nan = gr.df[["gene_id", "gene_name"]].isnull().sum(axis=1) + + if num_nan.sum(): + + # create intermediate df + gene_df = gr.df[["gene_id", "gene_name"]] + + # Firstly, write the problematic records to a file before imputation + problematic_gtf_path = os.path.join( + output_dir, "missing_gene_id_or_name_records.gtf" + ) + gr[num_nan > 0].to_gtf(problematic_gtf_path) + missing_record_msg = f"\nFound records with missing gene_id/gene_name field.\nThese records are reported in {problematic_gtf_path}." + + # impute using transcript_id + double_missing = num_nan == 2 + if double_missing.sum(): + gene_df.loc[num_nan == 2, "gene_id"] = gr.transcript_id[num_nan == 2] + gene_df.loc[num_nan == 2, "gene_name"] = gr.transcript_id[num_nan == 2] + double_missing_msg = f"\n - Found {(num_nan == 2).sum()} records missing gene_id and gene_name, imputing using transcript_id." + else: + double_missing_msg = "" + + # If one field is missing, impute using the other + # missing only gene_id + gene_id_missing = gene_df["gene_id"].isnull() + if gene_id_missing.sum(): + gene_df.loc[gene_df["gene_id"].isnull(), "gene_id"] = gene_df.loc[ + gene_df["gene_id"].isnull(), "gene_name" + ] + gene_id_missing_msg = f"\n - Found {gene_id_missing.sum()} records missing gene_id, imputing using gene_name." + else: + gene_id_missing_msg = "" + + gene_name_missing = gene_df["gene_name"].isnull() + if gene_name_missing.sum(): + gene_df.loc[gene_df["gene_name"].isnull(), "gene_name"] = gene_df.loc[ + gene_df["gene_name"].isnull(), "gene_id" + ] + gene_name_missing_msg = f"\n - Found {gene_name_missing.sum()} records missing gene_name, imputing using gene_id." + else: + gene_name_missing_msg = "" + + # write the warning message + warnings.warn( + "".join( + [ + missing_record_msg, + double_missing_msg, + gene_id_missing_msg, + gene_name_missing_msg, + ] + ) + ) + + # replace the old gene_id and gene_name fields using imputed one. + gr = gr.drop(["gene_id", "gene_name"]) + gr = gr.insert(gene_df) + # if gene_gr is used in the future, then concat them. + if write_clean_gtf: + clean_gr = pr.concat([gene_gr, gr]) + clean_gr.to_gtf(clean_gtf_path) + clean_gtf_msg = f"An clean GTF file is written to {clean_gtf_path}." + print(clean_gtf_msg) + + # gr = pr.concat([gr, gene_gr]) + + # return imputed gr + return gr + + +def make_splici_txome( + genome_path, + gtf_path, + read_length, + output_dir, + flank_trim_length=5, + filename_prefix="splici", + extra_spliced=None, + extra_unspliced=None, + dedup_seqs=False, + no_bt=False, + bt_path="bedtools", + no_flanking_merge=False, + write_clean_gtf=False, +): + """ + Construct the splici (spliced + introns) transcriptome for alevin-fry. + + Required Parameters + ---------- + genome_path : str + The path to a genome fasta file. + + gtf_path : str + The path to a gtf file. + + read_length : int + The read length of the single-cell experiment being processed. + + output_dir : str + The output directory, where the splici reference files will + be written. + + Optional Parameters + ---------- + + flank_trim_length : int (default: `5`) + The flank trimming length. + The final flank length is obtained by subtracting + the flank_trim_length from the read_length. + + filename_prefix : str (default: `splici`) + The file name prefix of the generated output files. + The derived flank length will be automatically + appended to the provided prefix. + + extra_spliced : str + A path to a fasta file. The records in this fasta file will be + regarded as spliced transcripts. + + extra_unspliced : str + The path to a fasta file. The records in this fasta file will be + regarded as introns. + + dedup_seqs : bool (default: `False`) + If True, the repeated sequences in the splici reference will be + deduplicated. + + no_bt : bool (default: `False`) + If true, biopython, instead of bedtools, will be used for + generating splici reference files. + + bt_path : str + The path to bedtools v2.30.0 or greater if it is not in the environment PATH. + + no_flanking_merge : bool (default: `False`) + If true, overlapping introns caused by the added flanking length will not be merged. + + write_clean_gtf : bool (default: `False`) + If true, when the input GTF contains invalid records, a clean GTF + file `clean_gtf.gtf` with these invalid records removed will be + exported to the output dir. + An invalid record is an exon record without an transcript ID + in the `transcript_id` field. + + + Returns + ------- + Nothing will be returned. The splici reference files will be written + to disk. + + Notes + ----- + * If using bedtools, a temp.bed and a temp.fa will be created and + then deleted. These two files encode the introns of each gene + and the exons of each transcript of each gene. + + """ + + import pyranges as pr + + import warnings + + import os + + import subprocess + import shutil + + from packaging.version import parse as parse_version + + from Bio import SeqIO + from Bio.SeqIO.FastaIO import SimpleFastaParser + + # Preparation + + # check flanking length + flank_length = read_length - flank_trim_length + + if flank_length < 0: + raise ValueError("Flank trim length cannot be larger than read length!") + + # check fasta file + if not os.path.isfile(genome_path): + raise IOError("Cannot open the input fasta file!") + + # check gtf file + if not os.path.isfile(gtf_path): + raise IOError("Cannot open the input gtf file!") + + def check_bedtools_version(bt_check_path): + try: + vstr = ( + subprocess.run([bt_path, "--version"], capture_output=True) + .stdout.decode() + .strip() + .split("v")[1] + ) + found_ver = parse_version(vstr) + req_ver = parse_version("2.30.0") + return found_ver >= req_ver + except subprocess.CalledProcessError as err: + # in this case couldn't even run subprocess + warnings.warn(f"Cannot check bedtools version.\n{err}") + return False + + # check bedtools + if not no_bt: + # check at the provided path + if not check_bedtools_version(bt_path): + # if it's not ok at the provided path, check + # the standard system path + if bt_path == "bedtools": + # in this case, there's nowhere else to check + # so give up on bedtools + print( + "bedtools in the environemnt PATH is either", + "older than v.2.30.0 or doesn't exist.", + "\nBiopython will be used.", + ) + no_bt = True + else: + print( + "bedtools specified by bt_path is either", + "older than v.2.30.0 or doesn't exist.", + "\nTry finding bedtools in the environmental PATH.", + ) + # if it's not ok at the standard system path + # fallback to biopython + if not check_bedtools_version("bedtools"): + print( + "bedtools in the environemnt PATH is either", + "older than v.2.30.0 or doesn't exist.", + "\nBiopython will be used.", + ) + no_bt = True + # found it at the system path + else: + bt_path = "bedtools" + print("Using bedtools in the environmental PATH.") + + # create out folder and temp folder inside + # create output folder + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # specify output file names + filename_prefix = filename_prefix + "_fl" + str(flank_length) + out_fa = os.path.join(output_dir, filename_prefix + ".fa") + out_t2g3col = os.path.join(output_dir, filename_prefix + "_t2g_3col.tsv") + id2name_path = os.path.join(output_dir, "gene_id_to_name.tsv") + + # load gtf + try: + gr = pr.read_gtf(gtf_path) + except ValueError: + # in this case couldn't even run subprocess + raise RuntimeError( + "PyRanges failed to parse the input GTF file. Please check the PyRanges documentation for the expected GTF format constraints.\nhttps://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf" + ) + + gr = pr.read_gtf(gtf_path) + + # check the validity of gr + gr = check_gr(gr, output_dir, write_clean_gtf) + + # write gene id to name tsv file + gr.df[["gene_id", "gene_name"]].drop_duplicates().to_csv( + id2name_path, sep="\t", header=False, index=False + ) + + # get introns + # the introns() function uses inplace=True argument from pandas, + # which will trigger an FutureWarning. + warnings.simplefilter(action="ignore", category=FutureWarning) + introns = gr.features.introns(by="transcript") + introns.Name = introns.gene_id + + if no_flanking_merge: + introns = introns.merge(strand=True, by=["Name"], slack=0) + + introns = introns.extend(flank_length) + + if not no_flanking_merge: + introns = introns.merge(strand=True, by=["Name"], slack=0) + + introns.Gene = introns.Name + introns.Name = [ + "-I".join(map(str, z)) + for z in zip( + introns.Name, + introns.Name.groupby(introns.Name) + .cumcount() + .astype(str) + .replace( + "0", + "", + ) + .values, + ) + ] + + # trim outbounded introns + with open(genome_path) as fasta_file: + chromsize = { + title.split()[0]: len(sequence) + for title, sequence in SimpleFastaParser(fasta_file) + } + introns = pr.gf.genome_bounds(introns, chromsize, clip=True) + + # deduplicate introns + if dedup_seqs: + introns.drop_duplicate_positions() + + # add splice status for introns + introns.splice_status = "U" + + # get exons + exons = gr[gr.Feature == "exon"] + + exons.Name = exons.transcript_id + exons.Gene = exons.gene_id + exons = exons.drop(exons.columns[~exons.columns.isin(introns.columns)].tolist()) + exons = exons.sort(["Name", "Start", "End"]) + # add splice status for exons + exons.splice_status = "S" + + # concat spliced transcripts and introns as splici + splici = pr.concat([exons, introns]) + + # splici = splici.sort(["Name", "Start", "End", "Gene"]) + + # write to files + # t2g_3col.tsv + splici.df[["Name", "Gene", "splice_status"]].drop_duplicates().to_csv( + out_t2g3col, sep="\t", header=False, index=False + ) + # print(splici.head()) + tid2strand = dict(zip(splici.Name, splici.Strand)) + + # splici fasta + if not no_bt: + try: + + # create temp folder + temp_dir = os.path.join(output_dir, "temp") + if not os.path.exists(temp_dir): + os.makedirs(temp_dir) + temp_fa = os.path.join(temp_dir, "temp.fa") + temp_bed = os.path.join(temp_dir, "temp.bed") + + # write bed file + splici.to_bed(temp_bed, keep=True) + + # run bedtools, ignore strand for now + bt_r = subprocess.run( + " ".join( + [ + bt_path, + "getfasta", + "-fi", + genome_path, + "-fo", + temp_fa, + "-bed", + temp_bed, + # "-s", + "-nameOnly", + ] + ), + shell=True, + capture_output=True, + ) + + # check return code + if bt_r.returncode != 0: + raise ValueError("Bedtools failed.") + + # parse temp fasta file to concat exons of each transcript + ei_parser = SeqIO.parse(temp_fa, "fasta") + prev_rec = next(ei_parser) + # prev_rec.id = prev_rec.id.split("(")[0] + prev_rec.description = "" + with open(out_fa, "w") as out_handle: + + for seq_record in ei_parser: + # seq_record.id = seq_record.id.split("(")[0] + seq_record.description = "" + if seq_record.id == prev_rec.id: + prev_rec += seq_record + + else: + if tid2strand[prev_rec.id] == "-": + prev_rec = prev_rec.reverse_complement( + id=True, description=True + ) + SeqIO.write(prev_rec, out_handle, "fasta") + prev_rec = seq_record + # Don't forget our last customer + if tid2strand[prev_rec.id] == "-": + prev_rec = prev_rec.reverse_complement(id=True, description=True) + SeqIO.write(prev_rec, out_handle, "fasta") + shutil.rmtree(temp_dir, ignore_errors=True) + except subprocess.CalledProcessError as err: + no_bt = True + warnings.warn(f"Bedtools failed. Use biopython instead.\n{err}") + shutil.rmtree(temp_dir, ignore_errors=True) + + if no_bt: + + from Bio.Seq import Seq + from Bio.SeqFeature import SeqFeature, FeatureLocation + + from Bio.SeqRecord import SeqRecord + + with open(out_fa, "w") as out_handle: + # read fasta, process a chromosome at a time + for seq_record in SeqIO.parse(genome_path, "fasta"): + # get all records on that chromosome + chr_records = introns[introns.Chromosome == seq_record.id].df + if not chr_records.empty: + chr_records.Strand = chr_records.Strand.replace( + ["+", "-"], [+1, -1] + ) + # init seq list + intron_seqs = [] + # for each intron record + for (idx, intron_record) in chr_records.iterrows(): + # create Seqeture object for extracting sequence from chromosome + intron_feature = SeqFeature( + FeatureLocation(intron_record.Start, intron_record.End), + type="intron", + strand=intron_record.Strand, + id=intron_record.Name, + ) + # append the intron sequence to the seq list, specify name as well + intron_seqs.append( + SeqRecord( + intron_feature.extract(seq_record).seq, + id=intron_record.Name, + description="", + ) + ) + # finally, write all intron sequences at once. + SeqIO.write(intron_seqs, out_handle, "fasta") + + # Then, process spliced transcripts + chr_records = exons[exons.Chromosome == seq_record.id].df + if not chr_records.empty: + txp_seqs = [] + # as spliced txps are the concat of all exon sequences, fist get the sequence of each exon separately,then sum them up. + for (tid, exon_records) in chr_records.groupby("Name"): + + # init exon seq list + exon_seqs = [] + # get the sequence of each exon + for (idx, exon_record) in exon_records.iterrows(): + # create SeqFeature object for the exon record + # ignore strand for now, get reverse complement later if needed + exon_feature = SeqFeature( + FeatureLocation(exon_record.Start, exon_record.End), + type="exon", + ) + # extract exon sequence from chromosome and append to exon seq list + exon_seqs.append( + SeqRecord( + exon_feature.extract(seq_record).seq, + id=tid, + description="", + ) + ) + # append the txp sequence to spliced txp seq list + # consider strand + if tid2strand[tid] == "-": + txp_seqs.append( + sum(exon_seqs, Seq("")).reverse_complement( + id=True, description=True + ) + ) + else: + txp_seqs.append(sum(exon_seqs, Seq(""))) + # write all spliced transcript serquence at once. + SeqIO.write(txp_seqs, out_handle, "fasta") + + # append extra spliced transcript onto splici + if extra_spliced is not None: + append_extra(extra_spliced, out_fa, out_t2g3col, id2name_path, "S") + + # append extra unspliced transcript onto splici + if extra_unspliced is not None: + append_extra(extra_unspliced, out_fa, out_t2g3col, id2name_path, "U") + + if dedup_seqs: + # Note: out_fa is intentionally passed as both + # the input and output file name parameters because + # we want to overwirte the duplicate fasta with + # the deduplicated fasta. + dedup_sequences(output_dir, out_fa, out_fa) + +def make_spliceu_txome( + genome_path, + gtf_path, + output_dir, + filename_prefix, + extra_spliced=None, + extra_unspliced=None, + dedup_seqs=False, + no_bt=False, + bt_path="bedtools", + write_clean_gtf=False, +): + """ + Construct the spliceu (spliced + unspliced) transcriptome for alevin-fry. + + Required Parameters + ---------- + genome_path : str + The path to a genome fasta file. + + gtf_path : str + The path to a gtf file. + + output_dir : str + The output directory, where the spliceu reference files will + be written. + + Optional Parameters + ---------- + + filename_prefix : str (default: `spliceu`) + The file name prefix of the generated output files. + The derived flank length will be automatically + appended to the provided prefix. + + extra_spliced : str + A path to a fasta file. The records in this fasta file will be + regarded as spliced transcripts. + + extra_unspliced : str + The path to a fasta file. The records in this fasta file will be + regarded as introns. + + dedup_seqs : bool (default: `False`) + If True, the repeated sequences in the spliceu reference will be + deduplicated. + + no_bt : bool (default: `False`) + If true, biopython, instead of bedtools, will be used for + generating spliceu reference files. + + bt_path : str + The path to bedtools v2.30.0 or greater if it is not in the environment PATH. + + write_clean_gtf : bool (default: `False`) + If true, when the input GTF contains invalid records, a clean GTF + file `clean_gtf.gtf` with these invalid records removed will be + exported to the output dir. + An invalid record is an exon record without an transcript ID + in the `transcript_id` field. + + + Returns + ------- + Nothing will be returned. The spliceu reference files will be written + to disk. + + Notes + ----- + * If using bedtools, a temp.bed and a temp.fa will be created and + then deleted. These two files encode the introns of each gene + and the exons of each transcript of each gene. + + """ + + import pyranges as pr + + import warnings + + import os + + import subprocess + import shutil + + from packaging.version import parse as parse_version + + from Bio import SeqIO + from Bio.SeqIO.FastaIO import SimpleFastaParser + + # Preparation + # check fasta file + if not os.path.isfile(genome_path): + raise IOError("Cannot open the input fasta file!") + + # check gtf file + if not os.path.isfile(gtf_path): + raise IOError("Cannot open the input gtf file!") + + def check_bedtools_version(bt_check_path): + try: + vstr = ( + subprocess.run([bt_path, "--version"], capture_output=True) + .stdout.decode() + .strip() + .split("v")[1] + ) + found_ver = parse_version(vstr) + req_ver = parse_version("2.30.0") + return found_ver >= req_ver + except subprocess.CalledProcessError as err: + # in this case couldn't even run subprocess + warnings.warn(f"Cannot check bedtools version.\n{err}") + return False + + # check bedtools + if not no_bt: + # check at the provided path + if not check_bedtools_version(bt_path): + # if it's not ok at the provided path, check + # the standard system path + if bt_path == "bedtools": + # in this case, there's nowhere else to check + # so give up on bedtools + print( + "bedtools in the environemnt PATH is either", + "older than v.2.30.0 or doesn't exist.", + "\nBiopython will be used.", + ) + no_bt = True + else: + print( + "bedtools specified by bt_path is either", + "older than v.2.30.0 or doesn't exist.", + "\nTry finding bedtools in the environmental PATH.", + ) + # if it's not ok at the standard system path + # fallback to biopython + if not check_bedtools_version("bedtools"): + print( + "bedtools in the environemnt PATH is either", + "older than v.2.30.0 or doesn't exist.", + "\nBiopython will be used.", + ) + no_bt = True + # found it at the system path + else: + bt_path = "bedtools" + print("Using bedtools in the environmental PATH.") + + # create out folder and temp folder inside + # create output folder + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # specify output file names + out_fa = os.path.join(output_dir, filename_prefix + ".fa") + out_t2g3col = os.path.join(output_dir, filename_prefix + "_t2g_3col.tsv") + out_t2g = os.path.join(output_dir, filename_prefix + "_t2g.tsv") + out_g2g = os.path.join(output_dir, filename_prefix + "_g2g.tsv") + id2name_path = os.path.join(output_dir, "gene_id_to_name.tsv") + + # load gtf + try: + gr = pr.read_gtf(gtf_path) + except ValueError: + # in this case couldn't even run subprocess + raise RuntimeError( + "PyRanges failed to parse the input GTF file. Please check the PyRanges documentation for the expected GTF format constraints.\nhttps://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf" + ) + + # check the validity of gr + gr = check_gr(gr, output_dir, write_clean_gtf) + + # write gene id to name tsv file + gr.df[["gene_id", "gene_name"]].drop_duplicates().to_csv( + id2name_path, sep="\t", header=False, index=False + ) + + # get unspliced + unspliced = gr.boundaries("gene_id") + unspliced.Name = unspliced.gene_id + "-I" + unspliced.Gene = unspliced.gene_id + + # add splice status for unspliced + unspliced.splice_status = "U" + + # get exons + exons = gr[gr.Feature == "exon"] + + exons.Name = exons.transcript_id + exons.Gene = exons.gene_id + exons = exons.drop(exons.columns[~exons.columns.isin(unspliced.columns)].tolist()) + exons = exons.sort(["Name", "Start", "End"]) + # add splice status for exons + exons.splice_status = "S" + + # concat spliced transcripts and unspliced as spliceu + spliceu = pr.concat([exons, unspliced]) + + # write to files + # t2g_3col.tsv + t2g_3col = spliceu.df[["Name", "Gene", "splice_status"]].drop_duplicates() + t2g_3col.to_csv( + out_t2g3col, sep="\t", header=False, index=False + ) + + # t2g.csv + t2g_3col[["Name", "Gene"]].to_csv( + out_t2g, sep="\t", header=False, index=False + ) + + # g2g.csv + t2g_3col[["Gene", "Gene"]].to_csv( + out_g2g, sep="\t", header=False, index=False + ) + + # print(spliceu.head()) + tid2strand = dict(zip(spliceu.Name, spliceu.Strand)) + + # spliceu fasta + if not no_bt: + try: + # create temp folder + temp_dir = os.path.join(output_dir, "temp") + if not os.path.exists(temp_dir): + os.makedirs(temp_dir) + temp_fa = os.path.join(temp_dir, "temp.fa") + temp_bed = os.path.join(temp_dir, "temp.bed") + + # write bed file + spliceu.to_bed(temp_bed, keep=True) + + # run bedtools, ignore strand for now + bt_r = subprocess.run( + " ".join( + [ + bt_path, + "getfasta", + "-fi", + genome_path, + "-fo", + temp_fa, + "-bed", + temp_bed, + # "-s", + "-nameOnly", + ] + ), + shell=True, + capture_output=True, + ) + + # check return code + if bt_r.returncode != 0: + raise ValueError("Bedtools failed.") + + # parse temp fasta file to concat exons of each transcript + ei_parser = SeqIO.parse(temp_fa, "fasta") + prev_rec = next(ei_parser) + # prev_rec.id = prev_rec.id.split("(")[0] + prev_rec.description = "" + with open(out_fa, "w") as out_handle: + + for seq_record in ei_parser: + # seq_record.id = seq_record.id.split("(")[0] + seq_record.description = "" + if seq_record.id == prev_rec.id: + prev_rec += seq_record + + else: + if tid2strand[prev_rec.id] == "-": + prev_rec = prev_rec.reverse_complement( + id=True, description=True + ) + SeqIO.write(prev_rec, out_handle, "fasta") + prev_rec = seq_record + # Don't forget our last customer + if tid2strand[prev_rec.id] == "-": + prev_rec = prev_rec.reverse_complement(id=True, description=True) + SeqIO.write(prev_rec, out_handle, "fasta") + shutil.rmtree(temp_dir, ignore_errors=True) + except subprocess.CalledProcessError as err: + no_bt = True + warnings.warn(f"Bedtools failed. Use biopython instead.\n{err}") + shutil.rmtree(temp_dir, ignore_errors=True) + + if no_bt: + + from Bio.Seq import Seq + from Bio.SeqFeature import SeqFeature, FeatureLocation + + from Bio.SeqRecord import SeqRecord + + with open(out_fa, "w") as out_handle: + # read fasta, process a chromosome at a time + for seq_record in SeqIO.parse(genome_path, "fasta"): + # get all records on that chromosome + chr_records = unspliced[unspliced.Chromosome == seq_record.id].df + if not chr_records.empty: + chr_records.Strand = chr_records.Strand.replace( + ["+", "-"], [+1, -1] + ) + # init seq list + unspliced_seqs = [] + # for each unspliced record + for (idx, unspliced_record) in chr_records.iterrows(): + # create Seqeture object for extracting sequence from chromosome + unspliced_feature = SeqFeature( + FeatureLocation(unspliced_record.Start, unspliced_record.End), + type="unspliced", + id=unspliced_record.Name, + ) + unspliced_feature.strand=unspliced_record.Strand + # append the unspliced sequence to the seq list, specify name as well + unspliced_seqs.append( + SeqRecord( + unspliced_feature.extract(seq_record).seq, + id=unspliced_record.Name, + description="", + ) + ) + # finally, write all unspliced sequences at once. + SeqIO.write(unspliced_seqs, out_handle, "fasta") + + # Then, process spliced transcripts + chr_records = exons[exons.Chromosome == seq_record.id].df + if not chr_records.empty: + txp_seqs = [] + # as spliced txps are the concat of all exon sequences, fist get the sequence of each exon separately,then sum them up. + for (tid, exon_records) in chr_records.groupby("Name"): + + # init exon seq list + exon_seqs = [] + # get the sequence of each exon + for (idx, exon_record) in exon_records.iterrows(): + # create SeqFeature object for the exon record + # ignore strand for now, get reverse complement later if needed + exon_feature = SeqFeature( + FeatureLocation(exon_record.Start, exon_record.End), + type="exon", + ) + # extract exon sequence from chromosome and append to exon seq list + exon_seqs.append( + SeqRecord( + exon_feature.extract(seq_record).seq, + id=tid, + description="", + ) + ) + # append the txp sequence to spliced txp seq list + # consider strand + if tid2strand[tid] == "-": + txp_seqs.append( + sum(exon_seqs, Seq("")).reverse_complement( + id=True, description=True + ) + ) + else: + txp_seqs.append(sum(exon_seqs, Seq(""))) + # write all spliced transcript serquence at once. + SeqIO.write(txp_seqs, out_handle, "fasta") + + # append extra spliced transcript onto spliceu + if extra_spliced is not None: + append_extra(extra_spliced, out_fa, out_t2g3col, id2name_path, "S") + + # append extra unspliced transcript onto spliceu + if extra_unspliced is not None: + append_extra(extra_unspliced, out_fa, out_t2g3col, id2name_path, "U") + + if dedup_seqs: + # Note: out_fa is intentionally passed as both + # the input and output file name parameters because + # we want to overwirte the duplicate fasta with + # the deduplicated fasta. + dedup_sequences(output_dir, out_fa, out_fa) From 4a9b7e03a058ee0427a6f007d20fb65bfc281cf5 Mon Sep 17 00:00:00 2001 From: DongzeHe Date: Fri, 6 Jan 2023 17:04:41 -0500 Subject: [PATCH 2/6] update readme --- README.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 60586ed..594eb89 100644 --- a/README.md +++ b/README.md @@ -13,20 +13,20 @@ The `pyroe` package provides useful functions for analyzing single-cell or singl ## Installation The `pyroe` package can be accessed from its [github repository](https://github.com/COMBINE-lab/pyroe), installed via [`pip`](https://pip.pypa.io/en/stable/). To install the `pyroe` package via `pip` use the command: -``` +```sh pip install pyroe ``` To make use of the `load_fry` function (which, itself, installs [scanpy](https://scanpy.readthedocs.io/en/stable/)), you should also be sure to install the package with the `scanpy` extra: -``` +```sh pip install pyroe[scanpy] ``` Alternatively, `pyroe` can be installed via `bioconda`, which will automatically install the variant of the package including `load_fry`, and will also install `bedtools` to enable faster construction of the *splici* reference (see below). This installation can be performed with the command: -``` +```sh conda install pyroe ``` @@ -39,7 +39,7 @@ The USA mode in alevin-fry requires a special index reference, which is called t Following is an example of calling the `pyroe` to make the *splici* index reference. The final flank length is calculated as the difference between the read length and the flank_trim_length, i.e., 5-2=3. This function allows you to add extra spliced and unspliced sequences to the *splici* index, which will be useful when some unannotated sequences, such as mitochondrial genes, are important for your experiment. **Note** : to make `pyroe` work more quickly, it is recommended to have the latest version of [`bedtools`](https://bedtools.readthedocs.io/en/latest/) ([Aaron R. Quinlan and Ira M. Hall, 2010](https://doi.org/10.1093/bioinformatics/btq033)) installed. -``` +```sh pyroe make-splici extdata/small_example_genome.fa extdata/small_example.gtf 5 splici_txome \ --flank-trim-length 2 --filename-prefix transcriptome_splici --dedup-seqs ``` @@ -90,6 +90,47 @@ optional arguments: The *splici* index of a given species consists of the transcriptome of the species, i.e., the spliced transcripts, and the intronic sequences of the species. Within a gene, if the flanked intronic sequences overlap with each other, the overlapped intronic sequences will be collapsed as a single intronic sequence to make sure each base will appear only once in the intronic sequences. For more detailed information, please check the section S2 in the supplementary file of [alevin-fry manuscript](https://www.biorxiv.org/content/10.1101/2021.06.29.450377v2). +## Prepare spliceu index for quantification with alevin-fry + +Recently, [He et al.](https://www.biorxiv.org/content/10.1101/2023.01.04.522742v1) introduced the *splice*d+*u*nspliced (_spliceu_) index in alevin-fry. This requires the _spliceu_ transcriptome. The command of making an *spliceu* transcriptome reference is similar to making a _splici_ reference: + +```sh +pyroe make-spliceu extdata/small_example_genome.fa extdata/small_example.gtf spliceu_txome \ +--filename-prefix transcriptome_spliceu +``` + +### Full usage +``` +usage: pyroe make-spliceu [-h] [--filename-prefix FILENAME_PREFIX] + [--extra-spliced EXTRA_SPLICED] + [--extra-unspliced EXTRA_UNSPLICED] + [--bt-path BT_PATH] [--no-bt] [--dedup-seqs] + [--write-clean-gtf] + genome-path gtf-path output-dir + +positional arguments: + genome-path The path to a genome fasta file. + gtf-path The path to a gtf file. + output-dir The output directory where Spliceu reference files + will be written. + +options: + -h, --help show this help message and exit + --filename-prefix FILENAME_PREFIX + The file name prefix of the generated output files. + --extra-spliced EXTRA_SPLICED + The path to an extra spliced sequence fasta file. + --extra-unspliced EXTRA_UNSPLICED + The path to an extra unspliced sequence fasta file. + --bt-path BT_PATH The path to bedtools v2.30.0 or greater. + --no-bt A flag indicates whether bedtools will be used for + generating Spliceu reference files. + --dedup-seqs A flag indicates whether identical sequences will be + deduplicated. + --write-clean-gtf A flag indicates whether a clean gtf will be written + if encountered invalid records. +``` + ## Processing alevin-fry quantification result The quantification result of alevin-fry can be loaded into python by the `load_fry()` function. This function takes a output directory returned by `alevin-fry quant` command as the minimum input, and load the quantification result as an `AnnData` object. When processing USA mode result, it assumes that the data comes from a single-cell RNA-sequencing experiment. If one wants to process single-nucleus RNA-sequencing data or prepare the single-cell data for RNA-velocity analysis, the `output_format` argument should be set as `snRNA` or `velocity` correspondingly. One can also define customized output format, see the Full Usage section for detail. @@ -176,7 +217,7 @@ We provide two python functions: - `load_processed_quant()` can fetch the quantification result of one or more available dataset as `fetch_processed_quant()`, and load them into python as `AnnData` objects. We also provide a CLI for fetching quantification results. -```bash +```sh pyroe fetch-quant 1 3 6 ``` From a7b6d5b0c38da48a25501821b152b2848ff7a7c7 Mon Sep 17 00:00:00 2001 From: DongzeHe Date: Fri, 6 Jan 2023 17:14:50 -0500 Subject: [PATCH 3/6] make flake8 happy --- src/pyroe/make_txome.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/src/pyroe/make_txome.py b/src/pyroe/make_txome.py index 58dac41..a26cc74 100644 --- a/src/pyroe/make_txome.py +++ b/src/pyroe/make_txome.py @@ -649,6 +649,7 @@ def check_bedtools_version(bt_check_path): # the deduplicated fasta. dedup_sequences(output_dir, out_fa, out_fa) + def make_spliceu_txome( genome_path, gtf_path, @@ -736,7 +737,8 @@ def make_spliceu_txome( from packaging.version import parse as parse_version from Bio import SeqIO - from Bio.SeqIO.FastaIO import SimpleFastaParser + + # from Bio.SeqIO.FastaIO import SimpleFastaParser # Preparation # check fasta file @@ -851,20 +853,14 @@ def check_bedtools_version(bt_check_path): # write to files # t2g_3col.tsv t2g_3col = spliceu.df[["Name", "Gene", "splice_status"]].drop_duplicates() - t2g_3col.to_csv( - out_t2g3col, sep="\t", header=False, index=False - ) - + t2g_3col.to_csv(out_t2g3col, sep="\t", header=False, index=False) + # t2g.csv - t2g_3col[["Name", "Gene"]].to_csv( - out_t2g, sep="\t", header=False, index=False - ) - + t2g_3col[["Name", "Gene"]].to_csv(out_t2g, sep="\t", header=False, index=False) + # g2g.csv - t2g_3col[["Gene", "Gene"]].to_csv( - out_g2g, sep="\t", header=False, index=False - ) - + t2g_3col[["Gene", "Gene"]].to_csv(out_g2g, sep="\t", header=False, index=False) + # print(spliceu.head()) tid2strand = dict(zip(spliceu.Name, spliceu.Strand)) @@ -957,11 +953,13 @@ def check_bedtools_version(bt_check_path): for (idx, unspliced_record) in chr_records.iterrows(): # create Seqeture object for extracting sequence from chromosome unspliced_feature = SeqFeature( - FeatureLocation(unspliced_record.Start, unspliced_record.End), + FeatureLocation( + unspliced_record.Start, unspliced_record.End + ), type="unspliced", id=unspliced_record.Name, ) - unspliced_feature.strand=unspliced_record.Strand + unspliced_feature.strand = unspliced_record.Strand # append the unspliced sequence to the seq list, specify name as well unspliced_seqs.append( SeqRecord( From b8331c21efdd56653e030e0008070f1b9b420262 Mon Sep 17 00:00:00 2001 From: DongzeHe Date: Fri, 6 Jan 2023 18:29:49 -0500 Subject: [PATCH 4/6] write test function for make_spliceu --- src/pyroe/make_txome.py | 3 +- tests/test_make_spliceu.py | 127 +++++++++++++++++++++++++++++++++++++ tests/test_make_splici.py | 2 +- 3 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 tests/test_make_spliceu.py diff --git a/src/pyroe/make_txome.py b/src/pyroe/make_txome.py index a26cc74..770ee8f 100644 --- a/src/pyroe/make_txome.py +++ b/src/pyroe/make_txome.py @@ -582,9 +582,10 @@ def check_bedtools_version(bt_check_path): intron_feature = SeqFeature( FeatureLocation(intron_record.Start, intron_record.End), type="intron", - strand=intron_record.Strand, id=intron_record.Name, ) + intron_feature.strand = intron_record.Strand + # append the intron sequence to the seq list, specify name as well intron_seqs.append( SeqRecord( diff --git a/tests/test_make_spliceu.py b/tests/test_make_spliceu.py new file mode 100644 index 0000000..8d39295 --- /dev/null +++ b/tests/test_make_spliceu.py @@ -0,0 +1,127 @@ +from pyroe.make_txome import make_spliceu_txome +import tempfile +import os +import pandas as pd +from Bio import SeqIO +from Bio.Seq import Seq + + +def test_make_spliceu(): + location = os.path.dirname(os.path.realpath(__file__)) + test_file_dir = os.path.join(location, "data", "test_make_splici") + genome_path = os.path.join(test_file_dir, "small_example_genome.fa") + gtf_path = os.path.join(test_file_dir, "small_example.gtf") + filename_prefix = "spliceu" + extra_spliced = os.path.join(test_file_dir, "extra_spliced.txt") + extra_unspliced = os.path.join(test_file_dir, "extra_unspliced.txt") + output_dir = tempfile.TemporaryDirectory() + make_spliceu_txome( + genome_path=genome_path, + gtf_path=gtf_path, + extra_spliced=extra_spliced, + extra_unspliced=extra_unspliced, + output_dir=output_dir.name, + filename_prefix=filename_prefix, + no_bt=True, + ) + + out_fa = os.path.join(output_dir.name, "spliceu.fa") + out_t2g = os.path.join(output_dir.name, "spliceu_t2g_3col.tsv") + genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta")) + # gene_annotation = pr.read_gtf(gtf_path).df + spliceu_seqs = SeqIO.to_dict(SeqIO.parse(out_fa, "fasta")) + t2g_colnames = ["txp_name", "gene_name", "splice_status"] + t2g_3col = pd.read_csv( + out_t2g, sep="\t", header=None, names=t2g_colnames + ).set_index(t2g_colnames[0]) + + # Define expected sequences + # For each gene, + # we have 3 spliced txps. Indexing starting from 1 + # tx1 ([1,2], [36, 45], [71,80]) + # tx2 ([46,55], [91, 100]) + # txp3 ([121,130], [156, 160], [191,200]) + # unspliced [1,200] + + chr_1_seq = genome["chr1"].seq + chr_2_seq = genome["chr2"].seq + # check spliced sequences + # tx1_1 + extracted_tx1_1 = spliceu_seqs["tx1.1"].seq + expected_tx1_1 = sum([chr_1_seq[0:2], chr_1_seq[35:45], chr_1_seq[70:80]], Seq("")) + assert extracted_tx1_1 == expected_tx1_1 + + # tx1_2 + extracted_tx1_2 = spliceu_seqs["tx1.2"].seq + expected_tx1_2 = sum([chr_1_seq[45:55], chr_1_seq[90:100]], Seq("")) + assert extracted_tx1_2 == expected_tx1_2 + + # tx1_3 + extracted_tx1_3 = spliceu_seqs["tx1.3"].seq + expected_tx1_3 = sum( + [chr_1_seq[120:130], chr_1_seq[155:160], chr_1_seq[190:200]], Seq("") + ) + assert extracted_tx1_3 == expected_tx1_3 + + # tx2_1 + extracted_tx2_1 = spliceu_seqs["tx2.1"].seq + expected_tx2_1 = sum( + [chr_2_seq[0:2], chr_2_seq[35:45], chr_2_seq[70:80]], Seq("") + ).reverse_complement() + assert extracted_tx2_1 == expected_tx2_1 + + # tx2_2 + extracted_tx2_2 = spliceu_seqs["tx2.2"].seq + expected_tx2_2 = sum( + [chr_2_seq[45:55], chr_2_seq[90:100]], Seq("") + ).reverse_complement() + assert extracted_tx2_2 == expected_tx2_2 + + # tx2_3 + extracted_tx2_3 = spliceu_seqs["tx2.3"].seq + expected_tx2_3 = sum( + [chr_2_seq[120:130], chr_2_seq[155:160], chr_2_seq[190:200]], Seq("") + ).reverse_complement() + assert extracted_tx2_3 == expected_tx2_3 + + # check intron seqs + # g1_I [0,200] + + extract_g1_I = spliceu_seqs["g1-I"].seq + expected_g1_I = sum([chr_1_seq[0:200]], Seq("")) + assert extract_g1_I == expected_g1_I + + # g2_I [0,200] + + extract_g2_I = spliceu_seqs["g2-I"].seq + expected_g2_I = sum([chr_2_seq[0:200]], Seq("")).reverse_complement() + assert extract_g2_I == expected_g2_I + + # check t2g_3col + t2g_dict = t2g_3col[[t2g_colnames[1]]].to_dict()["gene_name"] + t2s_dict = t2g_3col[[t2g_colnames[2]]].to_dict()["splice_status"] + + # check txp name to gene name mapping + # for g1 + assert t2g_dict["tx1.1"] == "g1" + assert t2g_dict["tx1.2"] == "g1" + assert t2g_dict["tx1.3"] == "g1" + assert t2g_dict["g1-I"] == "g1" + + # for g2 + assert t2g_dict["tx2.1"] == "g2" + assert t2g_dict["tx2.2"] == "g2" + assert t2g_dict["tx2.3"] == "g2" + assert t2g_dict["g2-I"] == "g2" + + # check txp name to splice status mapping + assert t2s_dict["tx1.1"] == "S" + assert t2s_dict["tx1.2"] == "S" + assert t2s_dict["tx1.3"] == "S" + assert t2s_dict["g1-I"] == "U" + + # for S + assert t2s_dict["tx2.1"] == "S" + assert t2s_dict["tx2.2"] == "S" + assert t2s_dict["tx2.3"] == "S" + assert t2s_dict["g2-I"] == "U" diff --git a/tests/test_make_splici.py b/tests/test_make_splici.py index 32798eb..21cb32f 100644 --- a/tests/test_make_splici.py +++ b/tests/test_make_splici.py @@ -1,4 +1,4 @@ -from pyroe.make_splici_txome import make_splici_txome +from pyroe.make_txome import make_splici_txome import tempfile import os import pandas as pd From e22b8c04d63ef13e160c065c54f27ffc701b468a Mon Sep 17 00:00:00 2001 From: DongzeHe Date: Fri, 6 Jan 2023 18:35:15 -0500 Subject: [PATCH 5/6] fix: minor fix --- tests/test_make_spliceu.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_make_spliceu.py b/tests/test_make_spliceu.py index 8d39295..a168ff1 100644 --- a/tests/test_make_spliceu.py +++ b/tests/test_make_spliceu.py @@ -25,10 +25,9 @@ def test_make_spliceu(): no_bt=True, ) - out_fa = os.path.join(output_dir.name, "spliceu.fa") - out_t2g = os.path.join(output_dir.name, "spliceu_t2g_3col.tsv") + out_fa = os.path.join(output_dir.name, "_".join([filename_prefix, "spliceu.fa"])) + out_t2g = os.path.join(output_dir.name, "_".join([filename_prefix, "spliceu_t2g_3col.tsv"])) genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta")) - # gene_annotation = pr.read_gtf(gtf_path).df spliceu_seqs = SeqIO.to_dict(SeqIO.parse(out_fa, "fasta")) t2g_colnames = ["txp_name", "gene_name", "splice_status"] t2g_3col = pd.read_csv( From 24509936a59a95764bbbcb8a9f551df83504e240 Mon Sep 17 00:00:00 2001 From: DongzeHe Date: Fri, 6 Jan 2023 20:30:28 -0500 Subject: [PATCH 6/6] fix: minor fix --- tests/test_make_spliceu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_make_spliceu.py b/tests/test_make_spliceu.py index a168ff1..0da4eb5 100644 --- a/tests/test_make_spliceu.py +++ b/tests/test_make_spliceu.py @@ -25,8 +25,8 @@ def test_make_spliceu(): no_bt=True, ) - out_fa = os.path.join(output_dir.name, "_".join([filename_prefix, "spliceu.fa"])) - out_t2g = os.path.join(output_dir.name, "_".join([filename_prefix, "spliceu_t2g_3col.tsv"])) + out_fa = os.path.join(output_dir.name, "".join([filename_prefix, ".fa"])) + out_t2g = os.path.join(output_dir.name, "".join([filename_prefix, "_t2g_3col.tsv"])) genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta")) spliceu_seqs = SeqIO.to_dict(SeqIO.parse(out_fa, "fasta")) t2g_colnames = ["txp_name", "gene_name", "splice_status"]