From 01c5f5578248ba13a30b71d4c1866b4481787daa Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Thu, 5 Jan 2023 00:08:26 +0000
Subject: [PATCH 1/5] Use checklines=1 to avoid inconsistent dialect warnings

---
 pisces/index.py | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index bdcf299..de27b3a 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -61,7 +61,7 @@ def build_index(args, unknown_args):
 
             transcripts_fasta_file = os.path.join(
                 index_dir_path, "transcripts", "transcripts.fa")
-                
+
             with open(transcripts_fasta_file, 'w') as transcripts_fasta:
                 ## all of this URI handling should probably use an existing library like
                 ## https://github.com/intake/filesystem_spec
@@ -104,12 +104,12 @@ def build_index(args, unknown_args):
                                         fasta.write(str(twobit[chrom]) + '\n')
                             reference = Fasta(
                                 _fasta_local_path.replace("2bit", "fa"))
-                    
+
                     with open(_fasta_local_path) as extra:
                         logging.info("Adding entries from %s", fasta)
                         for line in extra:
                             transcripts_fasta.write(line)
-                            
+
                 for gtf_loc, fasta_loc in zip(dataset["gtfs"],
                                               dataset["fastas"]):
                     gtf = urlparse(gtf_loc)
@@ -172,6 +172,7 @@ def build_index(args, unknown_args):
                                 db = gffutils.create_db(
                                     gtf.path,
                                     database_filename,
+                                    checklines=1,
                                     disable_infer_genes=
                                     not options["infer_features"],
                                     disable_infer_transcripts=
@@ -188,6 +189,7 @@ def build_index(args, unknown_args):
                                     db = gffutils.create_db(
                                         gtf.path,
                                         tmp_db,
+                                        checklines=1,
                                         disable_infer_genes=
                                         not options["infer_features"],
                                         disable_infer_transcripts=
@@ -220,6 +222,7 @@ def build_index(args, unknown_args):
                                             _gtf_local_path.replace(".gz", ""),
                                             _gtf_local_path.replace(
                                                 ".gz", "") + '.db',
+                                            checklines=1,
                                             disable_infer_genes=
                                             not options["infer_features"],
                                             disable_infer_transcripts=
@@ -231,6 +234,7 @@ def build_index(args, unknown_args):
                                         db = gffutils.create_db(
                                             _gtf_local_path,
                                             _gtf_local_path + '.db',
+                                            checklines=1,
                                             disable_infer_genes=
                                             not options["infer_features"],
                                             disable_infer_transcripts=
@@ -261,9 +265,9 @@ def build_index(args, unknown_args):
                     gene_annotation = os.path.join(
                         index_dir_path,
                         assembly + "_gene_annotation.txt")
-                        
+
                     def features_to_string(features, fasta_in, masked=True, strand=True):
-                        """ 
+                        """
                         """
                         sequences = []
                         feature_strand = "."
@@ -272,7 +276,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                             sequences.append(
                                 feature.sequence(
                                     fasta_in, use_strand=strand))
-                        # if the transcript is on the reverse strand, reverse order of exons 
+                        # if the transcript is on the reverse strand, reverse order of exons
                         # before concatenating
                         if feature_strand == "-":
                             sequences = sequences[::-1]
@@ -320,7 +324,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                 except KeyError:
                                     logging.info("No gene name tag found for %s", gene['gene_id'][0])
                                     gene_name = 'NA'
-                                    
+
                                 transcripts = db.children(gene, featuretype='transcript', order_by='start')
                                 for transcript in transcripts:
                                     # Write entry in the transcripts to genes table
@@ -328,22 +332,22 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         gene=gene['gene_id'][0],
                                         txp=transcript['transcript_id'][0]))
                                     # Construct the transcript sequences and write them to the FASTA
-                                    fa_seq, frac_masked = features_to_string(db.children(transcript, 
-                                                                                   featuretype='exon', 
-                                                                                   order_by='start'), 
-                                                                       reference, 
+                                    fa_seq, frac_masked = features_to_string(db.children(transcript,
+                                                                                   featuretype='exon',
+                                                                                   order_by='start'),
+                                                                       reference,
                                                                        masked=options["masked"])
                                     transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n')
                                     transcripts_fasta.write(fa_seq + '\n')
-                                    
-                                exons = db.children(gene, featuretype='exon', order_by='start') 
+
+                                exons = db.children(gene, featuretype='exon', order_by='start')
                                 merged_exons = db.merge(exons, merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive))
                                 if options["unprocessed_transcripts"]:
-                                    introns = db.interfeatures(merged_exons, new_featuretype='intron')                                                     
+                                    introns = db.interfeatures(merged_exons, new_featuretype='intron')
                                     transcripts_fasta.write('>' + "intronic_" + gene['gene_id'][0] + '\n')
                                     fa_seq, _ = features_to_string(introns, reference, masked=options["masked"])
                                     transcripts_fasta.write(fa_seq + '\n')
-                                
+
                                 annotation.write(
                                     "{gene}\t{type}\t{name}\t{chrom}\t{start}\t{stop}\t{length}\t{frac_masked}\n".
                                     format(
@@ -355,13 +359,13 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         chrom=gene.chrom,
                                         length=sum(len(exon) for exon in merged_exons),
                                         frac_masked=str(frac_masked)))
-                                        
+
                                 transcripts = db.children(
                                     gene,
                                     featuretype='transcript',
                                     order_by='start')
                                 pbar.update(1)
-                            
+
                     if options["intergenes"]:
                         for seqid in reference.keys():
                             logging.info("Merging overlapping genes on %s", seqid)

From 49b5fb59d5cd145155e83f9f6475aefd8ebcd9ce Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Thu, 5 Jan 2023 18:06:49 +0000
Subject: [PATCH 2/5] Allow a formal GTF dialect

---
 pisces/config.json |  3 +++
 pisces/index.py    | 22 ++++++++++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/pisces/config.json b/pisces/config.json
index 2e49e3f..733c79a 100755
--- a/pisces/config.json
+++ b/pisces/config.json
@@ -11,6 +11,7 @@
                 "options": {
                     "-k": 31,
                     "infer_features": false,
+                    "infer_dialect": true,
                     "gtf_name_tag": "gene_name",
                     "gtf_type_tag": "gene_type"
                 }
@@ -29,6 +30,7 @@
                 "options": {
                     "-k": 31,
                     "infer_features": false,
+                    "infer_dialect": true,
                     "gtf_name_tag": "gene_name",
                     "gtf_type_tag": "gene_type"
                 }
@@ -49,6 +51,7 @@
                 "options": {
                     "-k": 31,
                     "infer_features": false,
+                    "infer_dialect": true,
                     "gtf_name_tag": "gene_name",
                     "gtf_type_tag": "gene_type"
                 }
diff --git a/pisces/index.py b/pisces/index.py
index de27b3a..a57edd0 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -12,6 +12,7 @@ def build_index(args, unknown_args):
     import gffutils.merge_criteria as mc
     import atexit
     import shutil
+    import copy
     from tqdm import tqdm
     from collections import defaultdict
     from pprint import pprint
@@ -159,6 +160,19 @@ def build_index(args, unknown_args):
                         else:
                             reference = Fasta(_fasta_local_path)
 
+                    if not options["infer_dialect"]:
+                        ## set up a GTF dialect for gffutils
+                        from gffutils.constants import dialect as gff_dialect
+                        dialect = copy.copy(gff_dialect)
+                        dialect['field separator'] = '; '
+                        dialect['keyval separator'] = ' '
+                        dialect['fmt'] = 'gtf'
+                        dialect['quoted GFF2 values'] = True
+                        dialect['repeated keys'] = True
+                        dialect['trailing semicolon'] = True
+                    else:
+                        dialect = None
+
                     if gtf.scheme == '':
                         database_filename = gtf.path + '.db'
                         if os.path.exists(database_filename):
@@ -172,7 +186,7 @@ def build_index(args, unknown_args):
                                 db = gffutils.create_db(
                                     gtf.path,
                                     database_filename,
-                                    checklines=1,
+                                    dialect=dialect,
                                     disable_infer_genes=
                                     not options["infer_features"],
                                     disable_infer_transcripts=
@@ -189,7 +203,7 @@ def build_index(args, unknown_args):
                                     db = gffutils.create_db(
                                         gtf.path,
                                         tmp_db,
-                                        checklines=1,
+                                        dialect=dialect,
                                         disable_infer_genes=
                                         not options["infer_features"],
                                         disable_infer_transcripts=
@@ -222,7 +236,7 @@ def build_index(args, unknown_args):
                                             _gtf_local_path.replace(".gz", ""),
                                             _gtf_local_path.replace(
                                                 ".gz", "") + '.db',
-                                            checklines=1,
+                                            dialect=dialect,
                                             disable_infer_genes=
                                             not options["infer_features"],
                                             disable_infer_transcripts=
@@ -234,7 +248,7 @@ def build_index(args, unknown_args):
                                         db = gffutils.create_db(
                                             _gtf_local_path,
                                             _gtf_local_path + '.db',
-                                            checklines=1,
+                                            dialect=dialect,
                                             disable_infer_genes=
                                             not options["infer_features"],
                                             disable_infer_transcripts=

From a13cbedcfb81306aaaf7aafd53773b009e8b8a40 Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Fri, 6 Jan 2023 14:24:26 +0000
Subject: [PATCH 3/5] Pull gene tags from gene record

---
 pisces/index.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index a57edd0..22a63c8 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -162,6 +162,7 @@ def build_index(args, unknown_args):
 
                     if not options["infer_dialect"]:
                         ## set up a GTF dialect for gffutils
+                        ## this dialect works for https://github.com/daler/gffutils/issues/198
                         from gffutils.constants import dialect as gff_dialect
                         dialect = copy.copy(gff_dialect)
                         dialect['field separator'] = '; '
@@ -315,17 +316,12 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                 total=db.count_features_of_type('gene'),
                                 unit='gene') as pbar:
                             for gene in db.features_of_type('gene'):
-                                first_exon = next(
-                                    db.children(
-                                        gene,
-                                        featuretype='exon',
-                                        order_by='start'))
                                 try:
                                     if options["gene_type"] == True:
                                         type_tag = "gene_type"
                                     else:
                                         type_tag = options["gene_type"]
-                                    gene_type = first_exon[type_tag][0]
+                                    gene_type = gene[type_tag][0]
                                 except KeyError:
                                     logging.info("No gene type tag found for %s", gene['gene_id'][0])
                                     gene_type = 'NA'
@@ -334,7 +330,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                         name_tag = "gene_name"
                                     else:
                                         name_tag = options["gene_name"]
-                                    gene_name = first_exon[name_tag][0]
+                                    gene_name = gene[name_tag][0]
                                 except KeyError:
                                     logging.info("No gene name tag found for %s", gene['gene_id'][0])
                                     gene_name = 'NA'

From 5cbf32e809a823b720af5dedf8f3ca3fc17c6e25 Mon Sep 17 00:00:00 2001
From: Matt Shirley <matt_d.shirley@novartis.com>
Date: Fri, 6 Jan 2023 14:54:03 +0000
Subject: [PATCH 4/5] Use correct GTF tags for gene names and biotypes

---
 pisces/index.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pisces/index.py b/pisces/index.py
index 22a63c8..de325d4 100755
--- a/pisces/index.py
+++ b/pisces/index.py
@@ -317,19 +317,19 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
                                 unit='gene') as pbar:
                             for gene in db.features_of_type('gene'):
                                 try:
-                                    if options["gene_type"] == True:
-                                        type_tag = "gene_type"
+                                    if options["gtf_type_tag"] == True:
+                                        type_tag = "gene_biotype"
                                     else:
-                                        type_tag = options["gene_type"]
+                                        type_tag = options["gtf_type_tag"]
                                     gene_type = gene[type_tag][0]
                                 except KeyError:
                                     logging.info("No gene type tag found for %s", gene['gene_id'][0])
                                     gene_type = 'NA'
                                 try:
-                                    if options["gene_name"] == True:
-                                        name_tag = "gene_name"
+                                    if options["gtf_name_tag"] == True:
+                                        name_tag = "gene_id"
                                     else:
-                                        name_tag = options["gene_name"]
+                                        name_tag = options["gtf_name_tag"]
                                     gene_name = gene[name_tag][0]
                                 except KeyError:
                                     logging.info("No gene name tag found for %s", gene['gene_id'][0])

From a8fd0b04d39844b5ff3bf7770581750de603e0a4 Mon Sep 17 00:00:00 2001
From: Matt Shirley <mdshw5@gmail.com>
Date: Tue, 1 Aug 2023 11:11:13 -0400
Subject: [PATCH 5/5] Update main.yml

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 22a44ce..29f3ec3 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -30,7 +30,7 @@ jobs:
           python-version: ${{ matrix.python }}
           
       - name: Setup R
-        uses: r-lib/actions/setup-r@master
+        uses: r-lib/actions/setup-r@v2
         with:
             r-version: '3.6.2'