From 01c5f5578248ba13a30b71d4c1866b4481787daa Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 5 Jan 2023 00:08:26 +0000 Subject: [PATCH 1/5] Use checklines=1 to avoid inconsistent dialect warnings --- pisces/index.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index bdcf299..de27b3a 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -61,7 +61,7 @@ def build_index(args, unknown_args): transcripts_fasta_file = os.path.join( index_dir_path, "transcripts", "transcripts.fa") - + with open(transcripts_fasta_file, 'w') as transcripts_fasta: ## all of this URI handling should probably use an existing library like ## https://github.com/intake/filesystem_spec @@ -104,12 +104,12 @@ def build_index(args, unknown_args): fasta.write(str(twobit[chrom]) + '\n') reference = Fasta( _fasta_local_path.replace("2bit", "fa")) - + with open(_fasta_local_path) as extra: logging.info("Adding entries from %s", fasta) for line in extra: transcripts_fasta.write(line) - + for gtf_loc, fasta_loc in zip(dataset["gtfs"], dataset["fastas"]): gtf = urlparse(gtf_loc) @@ -172,6 +172,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( gtf.path, database_filename, + checklines=1, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -188,6 +189,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( gtf.path, tmp_db, + checklines=1, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -220,6 +222,7 @@ def build_index(args, unknown_args): _gtf_local_path.replace(".gz", ""), _gtf_local_path.replace( ".gz", "") + '.db', + checklines=1, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -231,6 +234,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( _gtf_local_path, _gtf_local_path + '.db', + checklines=1, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -261,9 +265,9 @@ def build_index(args, unknown_args): gene_annotation = os.path.join( index_dir_path, assembly + "_gene_annotation.txt") - + def features_to_string(features, fasta_in, masked=True, strand=True): - """ + """ """ sequences = [] feature_strand = "." @@ -272,7 +276,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): sequences.append( feature.sequence( fasta_in, use_strand=strand)) - # if the transcript is on the reverse strand, reverse order of exons + # if the transcript is on the reverse strand, reverse order of exons # before concatenating if feature_strand == "-": sequences = sequences[::-1] @@ -320,7 +324,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): except KeyError: logging.info("No gene name tag found for %s", gene['gene_id'][0]) gene_name = 'NA' - + transcripts = db.children(gene, featuretype='transcript', order_by='start') for transcript in transcripts: # Write entry in the transcripts to genes table @@ -328,22 +332,22 @@ def features_to_string(features, fasta_in, masked=True, strand=True): gene=gene['gene_id'][0], txp=transcript['transcript_id'][0])) # Construct the transcript sequences and write them to the FASTA - fa_seq, frac_masked = features_to_string(db.children(transcript, - featuretype='exon', - order_by='start'), - reference, + fa_seq, frac_masked = features_to_string(db.children(transcript, + featuretype='exon', + order_by='start'), + reference, masked=options["masked"]) transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n') transcripts_fasta.write(fa_seq + '\n') - - exons = db.children(gene, featuretype='exon', order_by='start') + + exons = db.children(gene, featuretype='exon', order_by='start') merged_exons = db.merge(exons, merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive)) if options["unprocessed_transcripts"]: - introns = db.interfeatures(merged_exons, new_featuretype='intron') + introns = db.interfeatures(merged_exons, new_featuretype='intron') transcripts_fasta.write('>' + "intronic_" + gene['gene_id'][0] + '\n') fa_seq, _ = features_to_string(introns, reference, masked=options["masked"]) transcripts_fasta.write(fa_seq + '\n') - + annotation.write( "{gene}\t{type}\t{name}\t{chrom}\t{start}\t{stop}\t{length}\t{frac_masked}\n". format( @@ -355,13 +359,13 @@ def features_to_string(features, fasta_in, masked=True, strand=True): chrom=gene.chrom, length=sum(len(exon) for exon in merged_exons), frac_masked=str(frac_masked))) - + transcripts = db.children( gene, featuretype='transcript', order_by='start') pbar.update(1) - + if options["intergenes"]: for seqid in reference.keys(): logging.info("Merging overlapping genes on %s", seqid) From 49b5fb59d5cd145155e83f9f6475aefd8ebcd9ce Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 5 Jan 2023 18:06:49 +0000 Subject: [PATCH 2/5] Allow a formal GTF dialect --- pisces/config.json | 3 +++ pisces/index.py | 22 ++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pisces/config.json b/pisces/config.json index 2e49e3f..733c79a 100755 --- a/pisces/config.json +++ b/pisces/config.json @@ -11,6 +11,7 @@ "options": { "-k": 31, "infer_features": false, + "infer_dialect": true, "gtf_name_tag": "gene_name", "gtf_type_tag": "gene_type" } @@ -29,6 +30,7 @@ "options": { "-k": 31, "infer_features": false, + "infer_dialect": true, "gtf_name_tag": "gene_name", "gtf_type_tag": "gene_type" } @@ -49,6 +51,7 @@ "options": { "-k": 31, "infer_features": false, + "infer_dialect": true, "gtf_name_tag": "gene_name", "gtf_type_tag": "gene_type" } diff --git a/pisces/index.py b/pisces/index.py index de27b3a..a57edd0 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -12,6 +12,7 @@ def build_index(args, unknown_args): import gffutils.merge_criteria as mc import atexit import shutil + import copy from tqdm import tqdm from collections import defaultdict from pprint import pprint @@ -159,6 +160,19 @@ def build_index(args, unknown_args): else: reference = Fasta(_fasta_local_path) + if not options["infer_dialect"]: + ## set up a GTF dialect for gffutils + from gffutils.constants import dialect as gff_dialect + dialect = copy.copy(gff_dialect) + dialect['field separator'] = '; ' + dialect['keyval separator'] = ' ' + dialect['fmt'] = 'gtf' + dialect['quoted GFF2 values'] = True + dialect['repeated keys'] = True + dialect['trailing semicolon'] = True + else: + dialect = None + if gtf.scheme == '': database_filename = gtf.path + '.db' if os.path.exists(database_filename): @@ -172,7 +186,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( gtf.path, database_filename, - checklines=1, + dialect=dialect, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -189,7 +203,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( gtf.path, tmp_db, - checklines=1, + dialect=dialect, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -222,7 +236,7 @@ def build_index(args, unknown_args): _gtf_local_path.replace(".gz", ""), _gtf_local_path.replace( ".gz", "") + '.db', - checklines=1, + dialect=dialect, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= @@ -234,7 +248,7 @@ def build_index(args, unknown_args): db = gffutils.create_db( _gtf_local_path, _gtf_local_path + '.db', - checklines=1, + dialect=dialect, disable_infer_genes= not options["infer_features"], disable_infer_transcripts= From a13cbedcfb81306aaaf7aafd53773b009e8b8a40 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 6 Jan 2023 14:24:26 +0000 Subject: [PATCH 3/5] Pull gene tags from gene record --- pisces/index.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index a57edd0..22a63c8 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -162,6 +162,7 @@ def build_index(args, unknown_args): if not options["infer_dialect"]: ## set up a GTF dialect for gffutils + ## this dialect works for https://github.com/daler/gffutils/issues/198 from gffutils.constants import dialect as gff_dialect dialect = copy.copy(gff_dialect) dialect['field separator'] = '; ' @@ -315,17 +316,12 @@ def features_to_string(features, fasta_in, masked=True, strand=True): total=db.count_features_of_type('gene'), unit='gene') as pbar: for gene in db.features_of_type('gene'): - first_exon = next( - db.children( - gene, - featuretype='exon', - order_by='start')) try: if options["gene_type"] == True: type_tag = "gene_type" else: type_tag = options["gene_type"] - gene_type = first_exon[type_tag][0] + gene_type = gene[type_tag][0] except KeyError: logging.info("No gene type tag found for %s", gene['gene_id'][0]) gene_type = 'NA' @@ -334,7 +330,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True): name_tag = "gene_name" else: name_tag = options["gene_name"] - gene_name = first_exon[name_tag][0] + gene_name = gene[name_tag][0] except KeyError: logging.info("No gene name tag found for %s", gene['gene_id'][0]) gene_name = 'NA' From 5cbf32e809a823b720af5dedf8f3ca3fc17c6e25 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 6 Jan 2023 14:54:03 +0000 Subject: [PATCH 4/5] Use correct GTF tags for gene names and biotypes --- pisces/index.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pisces/index.py b/pisces/index.py index 22a63c8..de325d4 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -317,19 +317,19 @@ def features_to_string(features, fasta_in, masked=True, strand=True): unit='gene') as pbar: for gene in db.features_of_type('gene'): try: - if options["gene_type"] == True: - type_tag = "gene_type" + if options["gtf_type_tag"] == True: + type_tag = "gene_biotype" else: - type_tag = options["gene_type"] + type_tag = options["gtf_type_tag"] gene_type = gene[type_tag][0] except KeyError: logging.info("No gene type tag found for %s", gene['gene_id'][0]) gene_type = 'NA' try: - if options["gene_name"] == True: - name_tag = "gene_name" + if options["gtf_name_tag"] == True: + name_tag = "gene_id" else: - name_tag = options["gene_name"] + name_tag = options["gtf_name_tag"] gene_name = gene[name_tag][0] except KeyError: logging.info("No gene name tag found for %s", gene['gene_id'][0]) From a8fd0b04d39844b5ff3bf7770581750de603e0a4 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Tue, 1 Aug 2023 11:11:13 -0400 Subject: [PATCH 5/5] Update main.yml --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 22a44ce..29f3ec3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,7 +30,7 @@ jobs: python-version: ${{ matrix.python }} - name: Setup R - uses: r-lib/actions/setup-r@master + uses: r-lib/actions/setup-r@v2 with: r-version: '3.6.2'