From ca0d2521ae3a07f8aef977b5f91c06923e58a2b1 Mon Sep 17 00:00:00 2001
From: DongzeHe <dongzehe.zaza@gmail.com>
Date: Fri, 6 Jan 2023 16:51:22 -0500
Subject: [PATCH 1/6] feat: add make spliceu txome function, and fix biopython
 Strand warning

---
 bin/pyroe               |   75 ++-
 setup.cfg               |    2 +-
 src/pyroe/__init__.py   |    4 +-
 src/pyroe/make_txome.py | 1027 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 1104 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 bin/pyroe
 create mode 100644 src/pyroe/make_txome.py

diff --git a/bin/pyroe b/bin/pyroe
old mode 100644
new mode 100755
index ab105ae..c4a4296
--- a/bin/pyroe
+++ b/bin/pyroe
@@ -2,7 +2,7 @@
 
 import logging
 
-from pyroe import make_splici_txome
+from pyroe import make_splici_txome, make_spliceu_txome
 from pyroe import fetch_processed_quant
 from pyroe import convert
 from pyroe import id_to_name
@@ -22,12 +22,15 @@ if __name__ == "__main__":
     parser.add_argument(
         "-v", "--version", action="version", version=f"pyroe {__version__}"
     )
+    
     subparsers = parser.add_subparsers(
         title="subcommands",
         dest="command",
         description="valid subcommands",
         help="additional help",
     )
+    
+    # make-splici
     parser_makeSplici = subparsers.add_parser(
         "make-splici", help="Make splici reference"
     )
@@ -101,6 +104,63 @@ if __name__ == "__main__":
         help="A flag indicates whether a clean gtf will be written if encountered invalid records.",
     )
 
+    # make-spliceu
+    parser_makeSpliceu = subparsers.add_parser(
+        "make-spliceu", help="Make spliceu reference"
+    )
+    parser_makeSpliceu.add_argument(
+        "genome_path",
+        metavar="genome-path",
+        type=str,
+        help="The path to a genome fasta file.",
+    )
+    parser_makeSpliceu.add_argument(
+        "gtf_path", metavar="gtf-path", type=str, help="The path to a gtf file."
+    )
+    parser_makeSpliceu.add_argument(
+        "output_dir",
+        metavar="output-dir",
+        type=str,
+        help="The output directory where Spliceu reference files will be written.",
+    )
+    parser_makeSpliceu.add_argument(
+        "--filename-prefix",
+        type=str,
+        default="spliceu",
+        help="The file name prefix of the generated output files.",
+    )
+    parser_makeSpliceu.add_argument(
+        "--extra-spliced",
+        type=str,
+        help="The path to an extra spliced sequence fasta file.",
+    )
+    parser_makeSpliceu.add_argument(
+        "--extra-unspliced",
+        type=str,
+        help="The path to an extra unspliced sequence fasta file.",
+    )
+    parser_makeSpliceu.add_argument(
+        "--bt-path",
+        type=str,
+        default="bedtools",
+        help="The path to bedtools v2.30.0 or greater.",
+    )
+    parser_makeSpliceu.add_argument(
+        "--no-bt",
+        action="store_true",
+        help="A flag indicates whether bedtools will be used for generating Spliceu reference files.",
+    )
+    parser_makeSpliceu.add_argument(
+        "--dedup-seqs",
+        action="store_true",
+        help="A flag indicates whether identical sequences will be deduplicated.",
+    )
+    parser_makeSpliceu.add_argument(
+        "--write-clean-gtf",
+        action="store_true",
+        help="A flag indicates whether a clean gtf will be written if encountered invalid records.",
+    )
+
     # parse available datasets
     available_datasets = fetch_processed_quant()
     epilog = "\n".join(
@@ -212,6 +272,19 @@ if __name__ == "__main__":
             no_flanking_merge=args.no_flanking_merge,
             write_clean_gtf=args.write_clean_gtf,
         )
+    elif args.command == "make-spliceu":
+        make_spliceu_txome(
+            genome_path=args.genome_path,
+            gtf_path=args.gtf_path,
+            output_dir=args.output_dir,
+            filename_prefix=args.filename_prefix,
+            extra_spliced=args.extra_spliced,
+            extra_unspliced=args.extra_unspliced,
+            dedup_seqs=args.dedup_seqs,
+            no_bt=args.no_bt,
+            bt_path=args.bt_path,
+            write_clean_gtf=args.write_clean_gtf,
+        )
     elif args.command == "fetch-quant":
         fetch_processed_quant(
             dataset_ids=args.dataset_ids,
diff --git a/setup.cfg b/setup.cfg
index 6155b22..8c8bee3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = pyroe
-version = 0.6.4
+version = 0.7.0
 author = Dongze He, Rob Patro
 author_email = dhe17@umd.edu, rob@cs.umd.edu
 description = utilities of alevin-fry
diff --git a/src/pyroe/__init__.py b/src/pyroe/__init__.py
index 89ce72b..3a7b831 100644
--- a/src/pyroe/__init__.py
+++ b/src/pyroe/__init__.py
@@ -1,7 +1,7 @@
-__version__ = "0.6.4"
+__version__ = "0.7.0"
 
 from pyroe.load_fry import load_fry
-from pyroe.make_splici_txome import make_splici_txome
+from pyroe.make_txome import make_splici_txome, make_spliceu_txome
 from pyroe.fetch_processed_quant import fetch_processed_quant
 from pyroe.load_processed_quant import load_processed_quant
 from pyroe.ProcessedQuant import ProcessedQuant
diff --git a/src/pyroe/make_txome.py b/src/pyroe/make_txome.py
new file mode 100644
index 0000000..58dac41
--- /dev/null
+++ b/src/pyroe/make_txome.py
@@ -0,0 +1,1027 @@
+def append_extra(extra_infile, out_fa, out_t2g3col, id2name_path, col_status):
+    """
+    extra_infile : str
+        path to the extra FASTA format input file from which to read the extra records
+    out_fa : str
+        path to the output FASTA file being created (will be *appended* to)
+    out_t2g3col : str
+        path to the output t2g 3-column file (will be *appended* to)
+    id2name_path : str
+        path to the output (existing) gene id to gene name tsv (will be *appended* to)
+    col_status : char
+        splicing status to use for these records (one of 'S' or 'U')
+    """
+
+    from Bio.SeqIO.FastaIO import SimpleFastaParser
+
+    # write extra sequences to the t2g file
+    with open(extra_infile) as extra_in_fa:
+        with open(out_fa, "a") as splici_fa:
+            with open(out_t2g3col, "a") as t2g:
+                with open(id2name_path, "a") as id_to_name:
+                    for title, sequence in SimpleFastaParser(extra_in_fa):
+                        tid = title.split()[0]
+                        splici_fa.write(f">{tid}\n")
+                        splici_fa.write(f"{sequence}\n")
+                        t2g.write(f"{tid}\t{tid}\t{col_status}\n")
+                        id_to_name.write(f"{tid}\t{tid}\n")
+
+
+def dedup_sequences(output_dir, in_fa, out_fa):
+    """
+    This function deduplicates the fasta sequences in `in_fa`, writes the
+    deduplicated sequences in fasta format to `out_fa`, and also produces a
+    2-column tsv file in `ouput_dir`/duplicate_entries.tsv recording the
+    entries that have been removed and the retained entry that they match.
+    Note: It is allowed (and in some cases intended) that `in_fa` == `out_fa`.
+    In this case, the input fasta file will be overwritten.
+
+    output_dir : str
+        The output directory where duplicate_entries.tsv will be written
+    in_fa : str
+        The path to the input fasta file to be deduplicated
+    out_fa : str
+        The path to where the ouput deduplicated fasta file should be written
+    """
+    import os
+    from Bio import SeqIO
+    from Bio.Seq import Seq
+    from Bio.SeqRecord import SeqRecord
+
+    record_representatives = {}
+
+    # read from the input fasta file and track
+    # the duplicate sequences
+    for record in SeqIO.parse(in_fa, "fasta"):
+        seq = str(record.seq)
+        name = record.id.split()[0]
+        # if we haven't seen it yet, this is the representative
+        # otherwise append the name of the duplicate
+        if seq in record_representatives:
+            record_representatives[seq].append(name)
+        else:
+            record_representatives[seq] = [name]
+
+    # write out the duplicate entries in the same format
+    # the salmon / pufferfish indexer uses
+    dup_file_name = os.path.sep.join([output_dir, "duplicate_entries.tsv"])
+    with open(dup_file_name, "w") as dup_file:
+        dup_file.write("RetainedRef\tDuplicateRef\n")
+        for k, vu in record_representatives.items():
+            if len(vu) > 1:
+                v = sorted(vu)
+                for dup in v[1:]:
+                    dup_file.write(f"{v[0]}\t{dup}\n")
+
+    # write the deduplicated output to file
+    # Note: it might be that out_file == in_file and we
+    # are overwriting the input.
+    with open(out_fa, "w") as ofile:
+        for k, vu in record_representatives.items():
+            v = sorted(vu)
+            rec = SeqRecord(Seq(k), id=v[0], description="")
+            SeqIO.write(rec, ofile, "fasta")
+
+
+def check_gr(gr, output_dir, write_clean_gtf):
+    """
+    This function checks the validity of a PyRanges object.
+    It will follow these rules:
+    1. For gene type records, gene_id and gene_name cannot be both missing.
+    2. All other types of records have to have a valid (not NaN) transcript_id.
+    3. For all types of records other than gene type,
+        - If the gene_id and gene_name are both missing, then the transcript_id
+    will be used to impute them.
+        - If one of gene_id and gene_name is missing, then the other will be
+    used to impute the missing one.
+
+    Args:
+        gr (pyranges): Stranded PyRanges object
+    """
+
+    import pandas as pd
+    import os
+    import pyranges as pr
+    import warnings
+
+    # split gene type records with others
+    # we don't use gene records in splici construction
+    gene_gr = gr[gr.Feature == "gene"]
+    gr = gr[gr.Feature != "gene"]
+    clean_gtf_path = os.path.join(output_dir, "clean_gtf.gtf")
+
+    # If required fields are missing, quit
+    if "transcript_id" not in gr.columns:
+        raise ValueError(
+            "The input GTF file doesn't contain transcript_id field; Cannot proceed."
+        )
+
+    if "gene_id" not in gr.columns:
+        # use gene_name as gene_id if exists, return an error otherwise
+        if "gene_name" not in gr.columns:
+            raise ValueError(
+                "The input GTF file doesn't contain gene_id and gene_name field; Cannot proceed."
+            )
+        else:
+            warnings.warn("gene_id field does not exist, use gene_name as gene_id.")
+            gene_id = pd.Series(data=gr.gene_name, name="gene_id")
+            gr = gr.insert(gene_id)
+
+    if "gene_name" not in gr.columns:
+        warnings.warn("gene_name field does not exist, use gene_id as gene_name.")
+        gene_name = pd.Series(data=gr.gene_id, name="gene_name")
+        gr = gr.insert(gene_name)
+
+    # If there is any NaN in transcript_id field, quit
+    if sum(gr.transcript_id.isnull()):
+
+        # first, write an clean GTF if needed
+        if write_clean_gtf:
+            gr = pr.concat([gene_gr, gr[gr.transcript_id.notnull()]])
+            gr.to_gtf(clean_gtf_path)
+            # gr[gr.transcript_id.notnull()].to_gtf(clean_gtf_path)
+            clean_gtf_msg = f"An clean GTF file is written to {clean_gtf_path}."
+        else:
+            clean_gtf_msg = "Set the write_clean_gtf flag if a clean GTF without the invalid records is needed."
+        # Then, raise a value error
+        raise ValueError(
+            f"Found NaN value in exons' transcript ID; Cannot proceed.\n{clean_gtf_msg}"
+        )
+
+    # Impute missing gene_id and gene_name values
+    # define an object
+    num_nan = gr.df[["gene_id", "gene_name"]].isnull().sum(axis=1)
+
+    if num_nan.sum():
+
+        # create intermediate df
+        gene_df = gr.df[["gene_id", "gene_name"]]
+
+        # Firstly, write the problematic records to a file before imputation
+        problematic_gtf_path = os.path.join(
+            output_dir, "missing_gene_id_or_name_records.gtf"
+        )
+        gr[num_nan > 0].to_gtf(problematic_gtf_path)
+        missing_record_msg = f"\nFound records with missing gene_id/gene_name field.\nThese records are reported in {problematic_gtf_path}."
+
+        # impute using transcript_id
+        double_missing = num_nan == 2
+        if double_missing.sum():
+            gene_df.loc[num_nan == 2, "gene_id"] = gr.transcript_id[num_nan == 2]
+            gene_df.loc[num_nan == 2, "gene_name"] = gr.transcript_id[num_nan == 2]
+            double_missing_msg = f"\n  - Found {(num_nan == 2).sum()} records missing gene_id and gene_name, imputing using transcript_id."
+        else:
+            double_missing_msg = ""
+
+        # If one field is missing, impute using the other
+        # missing only gene_id
+        gene_id_missing = gene_df["gene_id"].isnull()
+        if gene_id_missing.sum():
+            gene_df.loc[gene_df["gene_id"].isnull(), "gene_id"] = gene_df.loc[
+                gene_df["gene_id"].isnull(), "gene_name"
+            ]
+            gene_id_missing_msg = f"\n  - Found {gene_id_missing.sum()} records missing gene_id, imputing using gene_name."
+        else:
+            gene_id_missing_msg = ""
+
+        gene_name_missing = gene_df["gene_name"].isnull()
+        if gene_name_missing.sum():
+            gene_df.loc[gene_df["gene_name"].isnull(), "gene_name"] = gene_df.loc[
+                gene_df["gene_name"].isnull(), "gene_id"
+            ]
+            gene_name_missing_msg = f"\n  - Found {gene_name_missing.sum()} records missing gene_name, imputing using gene_id."
+        else:
+            gene_name_missing_msg = ""
+
+        # write the warning message
+        warnings.warn(
+            "".join(
+                [
+                    missing_record_msg,
+                    double_missing_msg,
+                    gene_id_missing_msg,
+                    gene_name_missing_msg,
+                ]
+            )
+        )
+
+        # replace the old gene_id and gene_name fields using imputed one.
+        gr = gr.drop(["gene_id", "gene_name"])
+        gr = gr.insert(gene_df)
+        # if gene_gr is used in the future, then concat them.
+        if write_clean_gtf:
+            clean_gr = pr.concat([gene_gr, gr])
+            clean_gr.to_gtf(clean_gtf_path)
+            clean_gtf_msg = f"An clean GTF file is written to {clean_gtf_path}."
+            print(clean_gtf_msg)
+
+            # gr = pr.concat([gr, gene_gr])
+
+    # return imputed gr
+    return gr
+
+
+def make_splici_txome(
+    genome_path,
+    gtf_path,
+    read_length,
+    output_dir,
+    flank_trim_length=5,
+    filename_prefix="splici",
+    extra_spliced=None,
+    extra_unspliced=None,
+    dedup_seqs=False,
+    no_bt=False,
+    bt_path="bedtools",
+    no_flanking_merge=False,
+    write_clean_gtf=False,
+):
+    """
+    Construct the splici (spliced + introns) transcriptome for alevin-fry.
+
+    Required Parameters
+    ----------
+    genome_path : str
+        The path to a genome fasta file.
+
+    gtf_path : str
+        The path to a gtf file.
+
+    read_length : int
+        The read length of the single-cell experiment being processed.
+
+    output_dir : str
+        The output directory, where the splici reference files will
+        be written.
+
+    Optional Parameters
+    ----------
+
+    flank_trim_length : int (default: `5`)
+        The flank trimming length.
+        The final flank length is obtained by subtracting
+        the flank_trim_length from the read_length.
+
+    filename_prefix : str (default: `splici`)
+        The file name prefix of the generated output files.
+        The derived flank length will be automatically
+        appended to the provided prefix.
+
+    extra_spliced : str
+        A path to a fasta file. The records in this fasta file will be
+        regarded as spliced transcripts.
+
+    extra_unspliced : str
+        The path to a fasta file. The records in this fasta file will be
+        regarded as introns.
+
+    dedup_seqs : bool  (default: `False`)
+        If True, the repeated sequences in the splici reference will be
+        deduplicated.
+
+    no_bt : bool (default: `False`)
+        If true, biopython, instead of bedtools, will be used for
+        generating splici reference files.
+
+    bt_path : str
+        The path to bedtools v2.30.0 or greater if it is not in the environment PATH.
+
+    no_flanking_merge : bool (default: `False`)
+        If true, overlapping introns caused by the added flanking length will not be merged.
+
+    write_clean_gtf : bool (default: `False`)
+        If true, when the input GTF contains invalid records, a clean GTF
+        file `clean_gtf.gtf` with these invalid records removed will be
+        exported to the output dir.
+        An invalid record is an exon record without an transcript ID
+        in the `transcript_id` field.
+
+
+    Returns
+    -------
+    Nothing will be returned. The splici reference files will be written
+    to disk.
+
+    Notes
+    -----
+    * If using bedtools, a temp.bed and a temp.fa will be created and
+        then deleted. These two files encode the introns of each gene
+        and the exons of each transcript of each gene.
+
+    """
+
+    import pyranges as pr
+
+    import warnings
+
+    import os
+
+    import subprocess
+    import shutil
+
+    from packaging.version import parse as parse_version
+
+    from Bio import SeqIO
+    from Bio.SeqIO.FastaIO import SimpleFastaParser
+
+    # Preparation
+
+    # check flanking length
+    flank_length = read_length - flank_trim_length
+
+    if flank_length < 0:
+        raise ValueError("Flank trim length cannot be larger than read length!")
+
+    # check fasta file
+    if not os.path.isfile(genome_path):
+        raise IOError("Cannot open the input fasta file!")
+
+    # check gtf file
+    if not os.path.isfile(gtf_path):
+        raise IOError("Cannot open the input gtf file!")
+
+    def check_bedtools_version(bt_check_path):
+        try:
+            vstr = (
+                subprocess.run([bt_path, "--version"], capture_output=True)
+                .stdout.decode()
+                .strip()
+                .split("v")[1]
+            )
+            found_ver = parse_version(vstr)
+            req_ver = parse_version("2.30.0")
+            return found_ver >= req_ver
+        except subprocess.CalledProcessError as err:
+            # in this case couldn't even run subprocess
+            warnings.warn(f"Cannot check bedtools version.\n{err}")
+            return False
+
+    # check bedtools
+    if not no_bt:
+        # check at the provided path
+        if not check_bedtools_version(bt_path):
+            # if it's not ok at the provided path, check
+            # the standard system path
+            if bt_path == "bedtools":
+                # in this case, there's nowhere else to check
+                # so give up on bedtools
+                print(
+                    "bedtools in the environemnt PATH is either",
+                    "older than v.2.30.0 or doesn't exist.",
+                    "\nBiopython will be used.",
+                )
+                no_bt = True
+            else:
+                print(
+                    "bedtools specified by bt_path is either",
+                    "older than v.2.30.0 or doesn't exist.",
+                    "\nTry finding bedtools in the environmental PATH.",
+                )
+                # if it's not ok at the standard system path
+                # fallback to biopython
+                if not check_bedtools_version("bedtools"):
+                    print(
+                        "bedtools in the environemnt PATH is either",
+                        "older than v.2.30.0 or doesn't exist.",
+                        "\nBiopython will be used.",
+                    )
+                    no_bt = True
+                # found it at the system path
+                else:
+                    bt_path = "bedtools"
+                    print("Using bedtools in the environmental PATH.")
+
+    # create out folder and temp folder inside
+    # create output folder
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # specify output file names
+    filename_prefix = filename_prefix + "_fl" + str(flank_length)
+    out_fa = os.path.join(output_dir, filename_prefix + ".fa")
+    out_t2g3col = os.path.join(output_dir, filename_prefix + "_t2g_3col.tsv")
+    id2name_path = os.path.join(output_dir, "gene_id_to_name.tsv")
+
+    # load gtf
+    try:
+        gr = pr.read_gtf(gtf_path)
+    except ValueError:
+        # in this case couldn't even run subprocess
+        raise RuntimeError(
+            "PyRanges failed to parse the input GTF file. Please check the PyRanges documentation for the expected GTF format constraints.\nhttps://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf"
+        )
+
+    gr = pr.read_gtf(gtf_path)
+
+    # check the validity of gr
+    gr = check_gr(gr, output_dir, write_clean_gtf)
+
+    # write gene id to name tsv file
+    gr.df[["gene_id", "gene_name"]].drop_duplicates().to_csv(
+        id2name_path, sep="\t", header=False, index=False
+    )
+
+    # get introns
+    # the introns() function uses inplace=True argument from pandas,
+    # which will trigger an FutureWarning.
+    warnings.simplefilter(action="ignore", category=FutureWarning)
+    introns = gr.features.introns(by="transcript")
+    introns.Name = introns.gene_id
+
+    if no_flanking_merge:
+        introns = introns.merge(strand=True, by=["Name"], slack=0)
+
+    introns = introns.extend(flank_length)
+
+    if not no_flanking_merge:
+        introns = introns.merge(strand=True, by=["Name"], slack=0)
+
+    introns.Gene = introns.Name
+    introns.Name = [
+        "-I".join(map(str, z))
+        for z in zip(
+            introns.Name,
+            introns.Name.groupby(introns.Name)
+            .cumcount()
+            .astype(str)
+            .replace(
+                "0",
+                "",
+            )
+            .values,
+        )
+    ]
+
+    # trim outbounded introns
+    with open(genome_path) as fasta_file:
+        chromsize = {
+            title.split()[0]: len(sequence)
+            for title, sequence in SimpleFastaParser(fasta_file)
+        }
+    introns = pr.gf.genome_bounds(introns, chromsize, clip=True)
+
+    # deduplicate introns
+    if dedup_seqs:
+        introns.drop_duplicate_positions()
+
+    # add splice status for introns
+    introns.splice_status = "U"
+
+    # get exons
+    exons = gr[gr.Feature == "exon"]
+
+    exons.Name = exons.transcript_id
+    exons.Gene = exons.gene_id
+    exons = exons.drop(exons.columns[~exons.columns.isin(introns.columns)].tolist())
+    exons = exons.sort(["Name", "Start", "End"])
+    # add splice status for exons
+    exons.splice_status = "S"
+
+    # concat spliced transcripts and introns as splici
+    splici = pr.concat([exons, introns])
+
+    # splici = splici.sort(["Name", "Start", "End", "Gene"])
+
+    # write to files
+    # t2g_3col.tsv
+    splici.df[["Name", "Gene", "splice_status"]].drop_duplicates().to_csv(
+        out_t2g3col, sep="\t", header=False, index=False
+    )
+    # print(splici.head())
+    tid2strand = dict(zip(splici.Name, splici.Strand))
+
+    # splici fasta
+    if not no_bt:
+        try:
+
+            # create temp folder
+            temp_dir = os.path.join(output_dir, "temp")
+            if not os.path.exists(temp_dir):
+                os.makedirs(temp_dir)
+            temp_fa = os.path.join(temp_dir, "temp.fa")
+            temp_bed = os.path.join(temp_dir, "temp.bed")
+
+            # write bed file
+            splici.to_bed(temp_bed, keep=True)
+
+            # run bedtools, ignore strand for now
+            bt_r = subprocess.run(
+                " ".join(
+                    [
+                        bt_path,
+                        "getfasta",
+                        "-fi",
+                        genome_path,
+                        "-fo",
+                        temp_fa,
+                        "-bed",
+                        temp_bed,
+                        # "-s",
+                        "-nameOnly",
+                    ]
+                ),
+                shell=True,
+                capture_output=True,
+            )
+
+            # check return code
+            if bt_r.returncode != 0:
+                raise ValueError("Bedtools failed.")
+
+            # parse temp fasta file to concat exons of each transcript
+            ei_parser = SeqIO.parse(temp_fa, "fasta")
+            prev_rec = next(ei_parser)
+            # prev_rec.id = prev_rec.id.split("(")[0]
+            prev_rec.description = ""
+            with open(out_fa, "w") as out_handle:
+
+                for seq_record in ei_parser:
+                    # seq_record.id = seq_record.id.split("(")[0]
+                    seq_record.description = ""
+                    if seq_record.id == prev_rec.id:
+                        prev_rec += seq_record
+
+                    else:
+                        if tid2strand[prev_rec.id] == "-":
+                            prev_rec = prev_rec.reverse_complement(
+                                id=True, description=True
+                            )
+                        SeqIO.write(prev_rec, out_handle, "fasta")
+                        prev_rec = seq_record
+                # Don't forget our last customer
+                if tid2strand[prev_rec.id] == "-":
+                    prev_rec = prev_rec.reverse_complement(id=True, description=True)
+                SeqIO.write(prev_rec, out_handle, "fasta")
+            shutil.rmtree(temp_dir, ignore_errors=True)
+        except subprocess.CalledProcessError as err:
+            no_bt = True
+            warnings.warn(f"Bedtools failed. Use biopython instead.\n{err}")
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
+    if no_bt:
+
+        from Bio.Seq import Seq
+        from Bio.SeqFeature import SeqFeature, FeatureLocation
+
+        from Bio.SeqRecord import SeqRecord
+
+        with open(out_fa, "w") as out_handle:
+            # read fasta, process a chromosome at a time
+            for seq_record in SeqIO.parse(genome_path, "fasta"):
+                # get all records on that chromosome
+                chr_records = introns[introns.Chromosome == seq_record.id].df
+                if not chr_records.empty:
+                    chr_records.Strand = chr_records.Strand.replace(
+                        ["+", "-"], [+1, -1]
+                    )
+                    # init seq list
+                    intron_seqs = []
+                    # for each intron record
+                    for (idx, intron_record) in chr_records.iterrows():
+                        # create Seqeture object for extracting sequence from chromosome
+                        intron_feature = SeqFeature(
+                            FeatureLocation(intron_record.Start, intron_record.End),
+                            type="intron",
+                            strand=intron_record.Strand,
+                            id=intron_record.Name,
+                        )
+                        # append the intron sequence to the seq list, specify name as well
+                        intron_seqs.append(
+                            SeqRecord(
+                                intron_feature.extract(seq_record).seq,
+                                id=intron_record.Name,
+                                description="",
+                            )
+                        )
+                    # finally, write all intron sequences at once.
+                    SeqIO.write(intron_seqs, out_handle, "fasta")
+
+                # Then, process spliced transcripts
+                chr_records = exons[exons.Chromosome == seq_record.id].df
+                if not chr_records.empty:
+                    txp_seqs = []
+                    # as spliced txps are the concat of all exon sequences, fist get the sequence of each exon separately,then sum them up.
+                    for (tid, exon_records) in chr_records.groupby("Name"):
+
+                        # init exon seq list
+                        exon_seqs = []
+                        # get the sequence of each exon
+                        for (idx, exon_record) in exon_records.iterrows():
+                            # create SeqFeature object for the exon record
+                            # ignore strand for now, get reverse complement later if needed
+                            exon_feature = SeqFeature(
+                                FeatureLocation(exon_record.Start, exon_record.End),
+                                type="exon",
+                            )
+                            # extract exon sequence from chromosome and append to exon seq list
+                            exon_seqs.append(
+                                SeqRecord(
+                                    exon_feature.extract(seq_record).seq,
+                                    id=tid,
+                                    description="",
+                                )
+                            )
+                        # append the txp sequence to spliced txp seq list
+                        # consider strand
+                        if tid2strand[tid] == "-":
+                            txp_seqs.append(
+                                sum(exon_seqs, Seq("")).reverse_complement(
+                                    id=True, description=True
+                                )
+                            )
+                        else:
+                            txp_seqs.append(sum(exon_seqs, Seq("")))
+                    # write all spliced transcript serquence at once.
+                    SeqIO.write(txp_seqs, out_handle, "fasta")
+
+    # append extra spliced transcript onto splici
+    if extra_spliced is not None:
+        append_extra(extra_spliced, out_fa, out_t2g3col, id2name_path, "S")
+
+    # append extra unspliced transcript onto splici
+    if extra_unspliced is not None:
+        append_extra(extra_unspliced, out_fa, out_t2g3col, id2name_path, "U")
+
+    if dedup_seqs:
+        # Note: out_fa is intentionally passed as both
+        # the input and output file name parameters because
+        # we want to overwirte the duplicate fasta with
+        # the deduplicated fasta.
+        dedup_sequences(output_dir, out_fa, out_fa)
+
+def make_spliceu_txome(
+    genome_path,
+    gtf_path,
+    output_dir,
+    filename_prefix,
+    extra_spliced=None,
+    extra_unspliced=None,
+    dedup_seqs=False,
+    no_bt=False,
+    bt_path="bedtools",
+    write_clean_gtf=False,
+):
+    """
+    Construct the spliceu (spliced + unspliced) transcriptome for alevin-fry.
+
+    Required Parameters
+    ----------
+    genome_path : str
+        The path to a genome fasta file.
+
+    gtf_path : str
+        The path to a gtf file.
+
+    output_dir : str
+        The output directory, where the spliceu reference files will
+        be written.
+
+    Optional Parameters
+    ----------
+
+    filename_prefix : str (default: `spliceu`)
+        The file name prefix of the generated output files.
+        The derived flank length will be automatically
+        appended to the provided prefix.
+
+    extra_spliced : str
+        A path to a fasta file. The records in this fasta file will be
+        regarded as spliced transcripts.
+
+    extra_unspliced : str
+        The path to a fasta file. The records in this fasta file will be
+        regarded as introns.
+
+    dedup_seqs : bool  (default: `False`)
+        If True, the repeated sequences in the spliceu reference will be
+        deduplicated.
+
+    no_bt : bool (default: `False`)
+        If true, biopython, instead of bedtools, will be used for
+        generating spliceu reference files.
+
+    bt_path : str
+        The path to bedtools v2.30.0 or greater if it is not in the environment PATH.
+
+    write_clean_gtf : bool (default: `False`)
+        If true, when the input GTF contains invalid records, a clean GTF
+        file `clean_gtf.gtf` with these invalid records removed will be
+        exported to the output dir.
+        An invalid record is an exon record without an transcript ID
+        in the `transcript_id` field.
+
+
+    Returns
+    -------
+    Nothing will be returned. The spliceu reference files will be written
+    to disk.
+
+    Notes
+    -----
+    * If using bedtools, a temp.bed and a temp.fa will be created and
+        then deleted. These two files encode the introns of each gene
+        and the exons of each transcript of each gene.
+
+    """
+
+    import pyranges as pr
+
+    import warnings
+
+    import os
+
+    import subprocess
+    import shutil
+
+    from packaging.version import parse as parse_version
+
+    from Bio import SeqIO
+    from Bio.SeqIO.FastaIO import SimpleFastaParser
+
+    # Preparation
+    # check fasta file
+    if not os.path.isfile(genome_path):
+        raise IOError("Cannot open the input fasta file!")
+
+    # check gtf file
+    if not os.path.isfile(gtf_path):
+        raise IOError("Cannot open the input gtf file!")
+
+    def check_bedtools_version(bt_check_path):
+        try:
+            vstr = (
+                subprocess.run([bt_path, "--version"], capture_output=True)
+                .stdout.decode()
+                .strip()
+                .split("v")[1]
+            )
+            found_ver = parse_version(vstr)
+            req_ver = parse_version("2.30.0")
+            return found_ver >= req_ver
+        except subprocess.CalledProcessError as err:
+            # in this case couldn't even run subprocess
+            warnings.warn(f"Cannot check bedtools version.\n{err}")
+            return False
+
+    # check bedtools
+    if not no_bt:
+        # check at the provided path
+        if not check_bedtools_version(bt_path):
+            # if it's not ok at the provided path, check
+            # the standard system path
+            if bt_path == "bedtools":
+                # in this case, there's nowhere else to check
+                # so give up on bedtools
+                print(
+                    "bedtools in the environemnt PATH is either",
+                    "older than v.2.30.0 or doesn't exist.",
+                    "\nBiopython will be used.",
+                )
+                no_bt = True
+            else:
+                print(
+                    "bedtools specified by bt_path is either",
+                    "older than v.2.30.0 or doesn't exist.",
+                    "\nTry finding bedtools in the environmental PATH.",
+                )
+                # if it's not ok at the standard system path
+                # fallback to biopython
+                if not check_bedtools_version("bedtools"):
+                    print(
+                        "bedtools in the environemnt PATH is either",
+                        "older than v.2.30.0 or doesn't exist.",
+                        "\nBiopython will be used.",
+                    )
+                    no_bt = True
+                # found it at the system path
+                else:
+                    bt_path = "bedtools"
+                    print("Using bedtools in the environmental PATH.")
+
+    # create out folder and temp folder inside
+    # create output folder
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # specify output file names
+    out_fa = os.path.join(output_dir, filename_prefix + ".fa")
+    out_t2g3col = os.path.join(output_dir, filename_prefix + "_t2g_3col.tsv")
+    out_t2g = os.path.join(output_dir, filename_prefix + "_t2g.tsv")
+    out_g2g = os.path.join(output_dir, filename_prefix + "_g2g.tsv")
+    id2name_path = os.path.join(output_dir, "gene_id_to_name.tsv")
+
+    # load gtf
+    try:
+        gr = pr.read_gtf(gtf_path)
+    except ValueError:
+        # in this case couldn't even run subprocess
+        raise RuntimeError(
+            "PyRanges failed to parse the input GTF file. Please check the PyRanges documentation for the expected GTF format constraints.\nhttps://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf"
+        )
+
+    # check the validity of gr
+    gr = check_gr(gr, output_dir, write_clean_gtf)
+
+    # write gene id to name tsv file
+    gr.df[["gene_id", "gene_name"]].drop_duplicates().to_csv(
+        id2name_path, sep="\t", header=False, index=False
+    )
+
+    # get unspliced
+    unspliced = gr.boundaries("gene_id")
+    unspliced.Name = unspliced.gene_id + "-I"
+    unspliced.Gene = unspliced.gene_id
+
+    # add splice status for unspliced
+    unspliced.splice_status = "U"
+
+    # get exons
+    exons = gr[gr.Feature == "exon"]
+
+    exons.Name = exons.transcript_id
+    exons.Gene = exons.gene_id
+    exons = exons.drop(exons.columns[~exons.columns.isin(unspliced.columns)].tolist())
+    exons = exons.sort(["Name", "Start", "End"])
+    # add splice status for exons
+    exons.splice_status = "S"
+
+    # concat spliced transcripts and unspliced as spliceu
+    spliceu = pr.concat([exons, unspliced])
+
+    # write to files
+    # t2g_3col.tsv
+    t2g_3col = spliceu.df[["Name", "Gene", "splice_status"]].drop_duplicates()
+    t2g_3col.to_csv(
+        out_t2g3col, sep="\t", header=False, index=False
+    )
+    
+    # t2g.csv
+    t2g_3col[["Name", "Gene"]].to_csv(
+        out_t2g, sep="\t", header=False, index=False
+    )
+    
+    # g2g.csv
+    t2g_3col[["Gene", "Gene"]].to_csv(
+        out_g2g, sep="\t", header=False, index=False
+    )
+    
+    # print(spliceu.head())
+    tid2strand = dict(zip(spliceu.Name, spliceu.Strand))
+
+    # spliceu fasta
+    if not no_bt:
+        try:
+            # create temp folder
+            temp_dir = os.path.join(output_dir, "temp")
+            if not os.path.exists(temp_dir):
+                os.makedirs(temp_dir)
+            temp_fa = os.path.join(temp_dir, "temp.fa")
+            temp_bed = os.path.join(temp_dir, "temp.bed")
+
+            # write bed file
+            spliceu.to_bed(temp_bed, keep=True)
+
+            # run bedtools, ignore strand for now
+            bt_r = subprocess.run(
+                " ".join(
+                    [
+                        bt_path,
+                        "getfasta",
+                        "-fi",
+                        genome_path,
+                        "-fo",
+                        temp_fa,
+                        "-bed",
+                        temp_bed,
+                        # "-s",
+                        "-nameOnly",
+                    ]
+                ),
+                shell=True,
+                capture_output=True,
+            )
+
+            # check return code
+            if bt_r.returncode != 0:
+                raise ValueError("Bedtools failed.")
+
+            # parse temp fasta file to concat exons of each transcript
+            ei_parser = SeqIO.parse(temp_fa, "fasta")
+            prev_rec = next(ei_parser)
+            # prev_rec.id = prev_rec.id.split("(")[0]
+            prev_rec.description = ""
+            with open(out_fa, "w") as out_handle:
+
+                for seq_record in ei_parser:
+                    # seq_record.id = seq_record.id.split("(")[0]
+                    seq_record.description = ""
+                    if seq_record.id == prev_rec.id:
+                        prev_rec += seq_record
+
+                    else:
+                        if tid2strand[prev_rec.id] == "-":
+                            prev_rec = prev_rec.reverse_complement(
+                                id=True, description=True
+                            )
+                        SeqIO.write(prev_rec, out_handle, "fasta")
+                        prev_rec = seq_record
+                # Don't forget our last customer
+                if tid2strand[prev_rec.id] == "-":
+                    prev_rec = prev_rec.reverse_complement(id=True, description=True)
+                SeqIO.write(prev_rec, out_handle, "fasta")
+            shutil.rmtree(temp_dir, ignore_errors=True)
+        except subprocess.CalledProcessError as err:
+            no_bt = True
+            warnings.warn(f"Bedtools failed. Use biopython instead.\n{err}")
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
+    if no_bt:
+
+        from Bio.Seq import Seq
+        from Bio.SeqFeature import SeqFeature, FeatureLocation
+
+        from Bio.SeqRecord import SeqRecord
+
+        with open(out_fa, "w") as out_handle:
+            # read fasta, process a chromosome at a time
+            for seq_record in SeqIO.parse(genome_path, "fasta"):
+                # get all records on that chromosome
+                chr_records = unspliced[unspliced.Chromosome == seq_record.id].df
+                if not chr_records.empty:
+                    chr_records.Strand = chr_records.Strand.replace(
+                        ["+", "-"], [+1, -1]
+                    )
+                    # init seq list
+                    unspliced_seqs = []
+                    # for each unspliced record
+                    for (idx, unspliced_record) in chr_records.iterrows():
+                        # create Seqeture object for extracting sequence from chromosome
+                        unspliced_feature = SeqFeature(
+                            FeatureLocation(unspliced_record.Start, unspliced_record.End),
+                            type="unspliced",
+                            id=unspliced_record.Name,
+                        )
+                        unspliced_feature.strand=unspliced_record.Strand
+                        # append the unspliced sequence to the seq list, specify name as well
+                        unspliced_seqs.append(
+                            SeqRecord(
+                                unspliced_feature.extract(seq_record).seq,
+                                id=unspliced_record.Name,
+                                description="",
+                            )
+                        )
+                    # finally, write all unspliced sequences at once.
+                    SeqIO.write(unspliced_seqs, out_handle, "fasta")
+
+                # Then, process spliced transcripts
+                chr_records = exons[exons.Chromosome == seq_record.id].df
+                if not chr_records.empty:
+                    txp_seqs = []
+                    # as spliced txps are the concat of all exon sequences, fist get the sequence of each exon separately,then sum them up.
+                    for (tid, exon_records) in chr_records.groupby("Name"):
+
+                        # init exon seq list
+                        exon_seqs = []
+                        # get the sequence of each exon
+                        for (idx, exon_record) in exon_records.iterrows():
+                            # create SeqFeature object for the exon record
+                            # ignore strand for now, get reverse complement later if needed
+                            exon_feature = SeqFeature(
+                                FeatureLocation(exon_record.Start, exon_record.End),
+                                type="exon",
+                            )
+                            # extract exon sequence from chromosome and append to exon seq list
+                            exon_seqs.append(
+                                SeqRecord(
+                                    exon_feature.extract(seq_record).seq,
+                                    id=tid,
+                                    description="",
+                                )
+                            )
+                        # append the txp sequence to spliced txp seq list
+                        # consider strand
+                        if tid2strand[tid] == "-":
+                            txp_seqs.append(
+                                sum(exon_seqs, Seq("")).reverse_complement(
+                                    id=True, description=True
+                                )
+                            )
+                        else:
+                            txp_seqs.append(sum(exon_seqs, Seq("")))
+                    # write all spliced transcript serquence at once.
+                    SeqIO.write(txp_seqs, out_handle, "fasta")
+
+    # append extra spliced transcript onto spliceu
+    if extra_spliced is not None:
+        append_extra(extra_spliced, out_fa, out_t2g3col, id2name_path, "S")
+
+    # append extra unspliced transcript onto spliceu
+    if extra_unspliced is not None:
+        append_extra(extra_unspliced, out_fa, out_t2g3col, id2name_path, "U")
+
+    if dedup_seqs:
+        # Note: out_fa is intentionally passed as both
+        # the input and output file name parameters because
+        # we want to overwirte the duplicate fasta with
+        # the deduplicated fasta.
+        dedup_sequences(output_dir, out_fa, out_fa)

From 4a9b7e03a058ee0427a6f007d20fb65bfc281cf5 Mon Sep 17 00:00:00 2001
From: DongzeHe <dongzehe.zaza@gmail.com>
Date: Fri, 6 Jan 2023 17:04:41 -0500
Subject: [PATCH 2/6] update readme

---
 README.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 60586ed..594eb89 100644
--- a/README.md
+++ b/README.md
@@ -13,20 +13,20 @@ The `pyroe` package provides useful functions for analyzing single-cell or singl
 ## Installation
 The `pyroe` package can be accessed from its [github repository](https://github.com/COMBINE-lab/pyroe), installed via [`pip`](https://pip.pypa.io/en/stable/). To install the `pyroe` package via `pip` use the command:
 
-```
+```sh
 pip install pyroe
 ```
 
 To make use of the `load_fry` function (which, itself, installs [scanpy](https://scanpy.readthedocs.io/en/stable/)), you should also be sure to install the package with the `scanpy` extra:
 
-```
+```sh
 pip install pyroe[scanpy]
 ```
 
 Alternatively, `pyroe` can be installed via `bioconda`, which will automatically install the variant of the package including `load_fry`, and will
 also install `bedtools` to enable faster construction of the *splici* reference (see below).  This installation can be performed with the command:
 
-```
+```sh
 conda install pyroe
 ```
 
@@ -39,7 +39,7 @@ The USA mode in alevin-fry requires a special index reference, which is called t
 
 Following is an example of calling the `pyroe` to make the *splici* index reference. The final flank length is calculated as the difference between the read length and the flank_trim_length, i.e., 5-2=3. This function allows you to add extra spliced and unspliced sequences to the *splici* index, which will be useful when some unannotated sequences, such as mitochondrial genes, are important for your experiment. **Note** : to make `pyroe` work more quickly, it is recommended to have the latest version of [`bedtools`](https://bedtools.readthedocs.io/en/latest/) ([Aaron R. Quinlan and Ira M. Hall, 2010](https://doi.org/10.1093/bioinformatics/btq033)) installed.
 
-```
+```sh
 pyroe make-splici extdata/small_example_genome.fa extdata/small_example.gtf 5 splici_txome \
       --flank-trim-length 2 --filename-prefix transcriptome_splici --dedup-seqs
 ```
@@ -90,6 +90,47 @@ optional arguments:
 
 The *splici* index of a given species consists of the transcriptome of the species, i.e., the spliced transcripts, and the intronic sequences of the species. Within a gene, if the flanked intronic sequences overlap with each other, the overlapped intronic sequences will be collapsed as a single intronic sequence to make sure each base will appear only once in the intronic sequences. For more detailed information, please check the section S2 in the supplementary file of [alevin-fry manuscript](https://www.biorxiv.org/content/10.1101/2021.06.29.450377v2).
 
+## Prepare spliceu index for quantification with alevin-fry
+
+Recently, [He et al.](https://www.biorxiv.org/content/10.1101/2023.01.04.522742v1) introduced the <ins>*splice*</ins>d+<ins>*u*</ins>nspliced (_spliceu_) index in alevin-fry. This requires the _spliceu_ transcriptome. The command of making an *spliceu* transcriptome reference is similar to making a _splici_ reference:
+
+```sh
+pyroe make-spliceu extdata/small_example_genome.fa extdata/small_example.gtf spliceu_txome \
+--filename-prefix transcriptome_spliceu
+```
+
+### Full usage
+```
+usage: pyroe make-spliceu [-h] [--filename-prefix FILENAME_PREFIX]
+                          [--extra-spliced EXTRA_SPLICED]
+                          [--extra-unspliced EXTRA_UNSPLICED]
+                          [--bt-path BT_PATH] [--no-bt] [--dedup-seqs]
+                          [--write-clean-gtf]
+                          genome-path gtf-path output-dir
+
+positional arguments:
+  genome-path           The path to a genome fasta file.
+  gtf-path              The path to a gtf file.
+  output-dir            The output directory where Spliceu reference files
+                        will be written.
+
+options:
+  -h, --help            show this help message and exit
+  --filename-prefix FILENAME_PREFIX
+                        The file name prefix of the generated output files.
+  --extra-spliced EXTRA_SPLICED
+                        The path to an extra spliced sequence fasta file.
+  --extra-unspliced EXTRA_UNSPLICED
+                        The path to an extra unspliced sequence fasta file.
+  --bt-path BT_PATH     The path to bedtools v2.30.0 or greater.
+  --no-bt               A flag indicates whether bedtools will be used for
+                        generating Spliceu reference files.
+  --dedup-seqs          A flag indicates whether identical sequences will be
+                        deduplicated.
+  --write-clean-gtf     A flag indicates whether a clean gtf will be written
+                        if encountered invalid records.
+```
+
 ## Processing alevin-fry quantification result
 
 The quantification result of alevin-fry can be loaded into python by the `load_fry()` function. This function takes a output directory returned by `alevin-fry quant` command as the minimum input, and load the quantification result as an `AnnData` object. When processing USA mode result, it assumes that the data comes from a single-cell RNA-sequencing experiment. If one wants to process single-nucleus RNA-sequencing data or prepare the single-cell data for RNA-velocity analysis, the `output_format` argument should be set as `snRNA` or `velocity` correspondingly. One can also define customized output format, see the Full Usage section for detail.
@@ -176,7 +217,7 @@ We provide two python functions:
 - `load_processed_quant()` can fetch the quantification result of one or more available dataset as `fetch_processed_quant()`, and load them into python as `AnnData` objects. We also provide a CLI for fetching quantification results.
 
 
-```bash
+```sh
 pyroe fetch-quant 1 3 6
 ```
 

From a7b6d5b0c38da48a25501821b152b2848ff7a7c7 Mon Sep 17 00:00:00 2001
From: DongzeHe <dongzehe.zaza@gmail.com>
Date: Fri, 6 Jan 2023 17:14:50 -0500
Subject: [PATCH 3/6] make flake8 happy

---
 src/pyroe/make_txome.py | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/src/pyroe/make_txome.py b/src/pyroe/make_txome.py
index 58dac41..a26cc74 100644
--- a/src/pyroe/make_txome.py
+++ b/src/pyroe/make_txome.py
@@ -649,6 +649,7 @@ def check_bedtools_version(bt_check_path):
         # the deduplicated fasta.
         dedup_sequences(output_dir, out_fa, out_fa)
 
+
 def make_spliceu_txome(
     genome_path,
     gtf_path,
@@ -736,7 +737,8 @@ def make_spliceu_txome(
     from packaging.version import parse as parse_version
 
     from Bio import SeqIO
-    from Bio.SeqIO.FastaIO import SimpleFastaParser
+
+    # from Bio.SeqIO.FastaIO import SimpleFastaParser
 
     # Preparation
     # check fasta file
@@ -851,20 +853,14 @@ def check_bedtools_version(bt_check_path):
     # write to files
     # t2g_3col.tsv
     t2g_3col = spliceu.df[["Name", "Gene", "splice_status"]].drop_duplicates()
-    t2g_3col.to_csv(
-        out_t2g3col, sep="\t", header=False, index=False
-    )
-    
+    t2g_3col.to_csv(out_t2g3col, sep="\t", header=False, index=False)
+
     # t2g.csv
-    t2g_3col[["Name", "Gene"]].to_csv(
-        out_t2g, sep="\t", header=False, index=False
-    )
-    
+    t2g_3col[["Name", "Gene"]].to_csv(out_t2g, sep="\t", header=False, index=False)
+
     # g2g.csv
-    t2g_3col[["Gene", "Gene"]].to_csv(
-        out_g2g, sep="\t", header=False, index=False
-    )
-    
+    t2g_3col[["Gene", "Gene"]].to_csv(out_g2g, sep="\t", header=False, index=False)
+
     # print(spliceu.head())
     tid2strand = dict(zip(spliceu.Name, spliceu.Strand))
 
@@ -957,11 +953,13 @@ def check_bedtools_version(bt_check_path):
                     for (idx, unspliced_record) in chr_records.iterrows():
                         # create Seqeture object for extracting sequence from chromosome
                         unspliced_feature = SeqFeature(
-                            FeatureLocation(unspliced_record.Start, unspliced_record.End),
+                            FeatureLocation(
+                                unspliced_record.Start, unspliced_record.End
+                            ),
                             type="unspliced",
                             id=unspliced_record.Name,
                         )
-                        unspliced_feature.strand=unspliced_record.Strand
+                        unspliced_feature.strand = unspliced_record.Strand
                         # append the unspliced sequence to the seq list, specify name as well
                         unspliced_seqs.append(
                             SeqRecord(

From b8331c21efdd56653e030e0008070f1b9b420262 Mon Sep 17 00:00:00 2001
From: DongzeHe <dongzehe.zaza@gmail.com>
Date: Fri, 6 Jan 2023 18:29:49 -0500
Subject: [PATCH 4/6] write test function for make_spliceu

---
 src/pyroe/make_txome.py    |   3 +-
 tests/test_make_spliceu.py | 127 +++++++++++++++++++++++++++++++++++++
 tests/test_make_splici.py  |   2 +-
 3 files changed, 130 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_make_spliceu.py

diff --git a/src/pyroe/make_txome.py b/src/pyroe/make_txome.py
index a26cc74..770ee8f 100644
--- a/src/pyroe/make_txome.py
+++ b/src/pyroe/make_txome.py
@@ -582,9 +582,10 @@ def check_bedtools_version(bt_check_path):
                         intron_feature = SeqFeature(
                             FeatureLocation(intron_record.Start, intron_record.End),
                             type="intron",
-                            strand=intron_record.Strand,
                             id=intron_record.Name,
                         )
+                        intron_feature.strand = intron_record.Strand
+
                         # append the intron sequence to the seq list, specify name as well
                         intron_seqs.append(
                             SeqRecord(
diff --git a/tests/test_make_spliceu.py b/tests/test_make_spliceu.py
new file mode 100644
index 0000000..8d39295
--- /dev/null
+++ b/tests/test_make_spliceu.py
@@ -0,0 +1,127 @@
+from pyroe.make_txome import make_spliceu_txome
+import tempfile
+import os
+import pandas as pd
+from Bio import SeqIO
+from Bio.Seq import Seq
+
+
+def test_make_spliceu():
+    location = os.path.dirname(os.path.realpath(__file__))
+    test_file_dir = os.path.join(location, "data", "test_make_splici")
+    genome_path = os.path.join(test_file_dir, "small_example_genome.fa")
+    gtf_path = os.path.join(test_file_dir, "small_example.gtf")
+    filename_prefix = "spliceu"
+    extra_spliced = os.path.join(test_file_dir, "extra_spliced.txt")
+    extra_unspliced = os.path.join(test_file_dir, "extra_unspliced.txt")
+    output_dir = tempfile.TemporaryDirectory()
+    make_spliceu_txome(
+        genome_path=genome_path,
+        gtf_path=gtf_path,
+        extra_spliced=extra_spliced,
+        extra_unspliced=extra_unspliced,
+        output_dir=output_dir.name,
+        filename_prefix=filename_prefix,
+        no_bt=True,
+    )
+
+    out_fa = os.path.join(output_dir.name, "spliceu.fa")
+    out_t2g = os.path.join(output_dir.name, "spliceu_t2g_3col.tsv")
+    genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))
+    # gene_annotation = pr.read_gtf(gtf_path).df
+    spliceu_seqs = SeqIO.to_dict(SeqIO.parse(out_fa, "fasta"))
+    t2g_colnames = ["txp_name", "gene_name", "splice_status"]
+    t2g_3col = pd.read_csv(
+        out_t2g, sep="\t", header=None, names=t2g_colnames
+    ).set_index(t2g_colnames[0])
+
+    # Define expected sequences
+    # For each gene,
+    # we have 3 spliced txps. Indexing starting from 1
+    # tx1 ([1,2], [36, 45], [71,80])
+    # tx2 ([46,55], [91, 100])
+    # txp3 ([121,130], [156, 160], [191,200])
+    # unspliced [1,200]
+
+    chr_1_seq = genome["chr1"].seq
+    chr_2_seq = genome["chr2"].seq
+    # check spliced sequences
+    # tx1_1
+    extracted_tx1_1 = spliceu_seqs["tx1.1"].seq
+    expected_tx1_1 = sum([chr_1_seq[0:2], chr_1_seq[35:45], chr_1_seq[70:80]], Seq(""))
+    assert extracted_tx1_1 == expected_tx1_1
+
+    # tx1_2
+    extracted_tx1_2 = spliceu_seqs["tx1.2"].seq
+    expected_tx1_2 = sum([chr_1_seq[45:55], chr_1_seq[90:100]], Seq(""))
+    assert extracted_tx1_2 == expected_tx1_2
+
+    # tx1_3
+    extracted_tx1_3 = spliceu_seqs["tx1.3"].seq
+    expected_tx1_3 = sum(
+        [chr_1_seq[120:130], chr_1_seq[155:160], chr_1_seq[190:200]], Seq("")
+    )
+    assert extracted_tx1_3 == expected_tx1_3
+
+    # tx2_1
+    extracted_tx2_1 = spliceu_seqs["tx2.1"].seq
+    expected_tx2_1 = sum(
+        [chr_2_seq[0:2], chr_2_seq[35:45], chr_2_seq[70:80]], Seq("")
+    ).reverse_complement()
+    assert extracted_tx2_1 == expected_tx2_1
+
+    # tx2_2
+    extracted_tx2_2 = spliceu_seqs["tx2.2"].seq
+    expected_tx2_2 = sum(
+        [chr_2_seq[45:55], chr_2_seq[90:100]], Seq("")
+    ).reverse_complement()
+    assert extracted_tx2_2 == expected_tx2_2
+
+    # tx2_3
+    extracted_tx2_3 = spliceu_seqs["tx2.3"].seq
+    expected_tx2_3 = sum(
+        [chr_2_seq[120:130], chr_2_seq[155:160], chr_2_seq[190:200]], Seq("")
+    ).reverse_complement()
+    assert extracted_tx2_3 == expected_tx2_3
+
+    # check intron seqs
+    # g1_I [0,200]
+
+    extract_g1_I = spliceu_seqs["g1-I"].seq
+    expected_g1_I = sum([chr_1_seq[0:200]], Seq(""))
+    assert extract_g1_I == expected_g1_I
+
+    # g2_I [0,200]
+
+    extract_g2_I = spliceu_seqs["g2-I"].seq
+    expected_g2_I = sum([chr_2_seq[0:200]], Seq("")).reverse_complement()
+    assert extract_g2_I == expected_g2_I
+
+    # check t2g_3col
+    t2g_dict = t2g_3col[[t2g_colnames[1]]].to_dict()["gene_name"]
+    t2s_dict = t2g_3col[[t2g_colnames[2]]].to_dict()["splice_status"]
+
+    # check txp name to gene name mapping
+    # for g1
+    assert t2g_dict["tx1.1"] == "g1"
+    assert t2g_dict["tx1.2"] == "g1"
+    assert t2g_dict["tx1.3"] == "g1"
+    assert t2g_dict["g1-I"] == "g1"
+
+    # for g2
+    assert t2g_dict["tx2.1"] == "g2"
+    assert t2g_dict["tx2.2"] == "g2"
+    assert t2g_dict["tx2.3"] == "g2"
+    assert t2g_dict["g2-I"] == "g2"
+
+    # check txp name to splice status mapping
+    assert t2s_dict["tx1.1"] == "S"
+    assert t2s_dict["tx1.2"] == "S"
+    assert t2s_dict["tx1.3"] == "S"
+    assert t2s_dict["g1-I"] == "U"
+
+    # for S
+    assert t2s_dict["tx2.1"] == "S"
+    assert t2s_dict["tx2.2"] == "S"
+    assert t2s_dict["tx2.3"] == "S"
+    assert t2s_dict["g2-I"] == "U"
diff --git a/tests/test_make_splici.py b/tests/test_make_splici.py
index 32798eb..21cb32f 100644
--- a/tests/test_make_splici.py
+++ b/tests/test_make_splici.py
@@ -1,4 +1,4 @@
-from pyroe.make_splici_txome import make_splici_txome
+from pyroe.make_txome import make_splici_txome
 import tempfile
 import os
 import pandas as pd

From e22b8c04d63ef13e160c065c54f27ffc701b468a Mon Sep 17 00:00:00 2001
From: DongzeHe <dongzehe.zaza@gmail.com>
Date: Fri, 6 Jan 2023 18:35:15 -0500
Subject: [PATCH 5/6] fix: minor fix

---
 tests/test_make_spliceu.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/test_make_spliceu.py b/tests/test_make_spliceu.py
index 8d39295..a168ff1 100644
--- a/tests/test_make_spliceu.py
+++ b/tests/test_make_spliceu.py
@@ -25,10 +25,9 @@ def test_make_spliceu():
         no_bt=True,
     )
 
-    out_fa = os.path.join(output_dir.name, "spliceu.fa")
-    out_t2g = os.path.join(output_dir.name, "spliceu_t2g_3col.tsv")
+    out_fa = os.path.join(output_dir.name, "_".join([filename_prefix, "spliceu.fa"]))
+    out_t2g = os.path.join(output_dir.name, "_".join([filename_prefix, "spliceu_t2g_3col.tsv"]))
     genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))
-    # gene_annotation = pr.read_gtf(gtf_path).df
     spliceu_seqs = SeqIO.to_dict(SeqIO.parse(out_fa, "fasta"))
     t2g_colnames = ["txp_name", "gene_name", "splice_status"]
     t2g_3col = pd.read_csv(

From 24509936a59a95764bbbcb8a9f551df83504e240 Mon Sep 17 00:00:00 2001
From: DongzeHe <dongzehe.zaza@gmail.com>
Date: Fri, 6 Jan 2023 20:30:28 -0500
Subject: [PATCH 6/6] fix: minor fix

---
 tests/test_make_spliceu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_make_spliceu.py b/tests/test_make_spliceu.py
index a168ff1..0da4eb5 100644
--- a/tests/test_make_spliceu.py
+++ b/tests/test_make_spliceu.py
@@ -25,8 +25,8 @@ def test_make_spliceu():
         no_bt=True,
     )
 
-    out_fa = os.path.join(output_dir.name, "_".join([filename_prefix, "spliceu.fa"]))
-    out_t2g = os.path.join(output_dir.name, "_".join([filename_prefix, "spliceu_t2g_3col.tsv"]))
+    out_fa = os.path.join(output_dir.name, "".join([filename_prefix, ".fa"]))
+    out_t2g = os.path.join(output_dir.name, "".join([filename_prefix, "_t2g_3col.tsv"]))
     genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))
     spliceu_seqs = SeqIO.to_dict(SeqIO.parse(out_fa, "fasta"))
     t2g_colnames = ["txp_name", "gene_name", "splice_status"]