Skip to content

Commit

Permalink
added blat command for similar functionality as blast
Browse files Browse the repository at this point in the history
  • Loading branch information
RAHenriksen committed Dec 27, 2024
1 parent 0ff2d3e commit f254f5d
Showing 1 changed file with 244 additions and 0 deletions.
244 changes: 244 additions & 0 deletions bifrost_chewbbaca/rule__blat_genecall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
import os
import subprocess
import traceback
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
from bifrostlib.datahandling import Component, Sample
from bifrostlib.datahandling import SampleComponentReference
from bifrostlib.datahandling import SampleComponent
from pathlib import Path

def parse_blat_output(output_tsv, assembly_sequences):
"""
processes the output after running the blast command
"""

# Dictionary to store the best hit per (locus, contig)
best_hits = {}
# List to store all full-coverage, 100%-identity hits
full_coverage_hits = []

# Open and parse the BLAT output TSV file
with open(output_tsv, 'r') as file:
reader = csv.reader(file, delimiter='\t')
for cols in reader:
record = {
'qaccver': cols[0],
'saccver': cols[1],
'slen': int(cols[4]),
'pident': float(cols[3]),
'length': int(cols[4]),
'mismatch': int(cols[5]),
'gapopen': int(cols[6]),
'qstart': int(cols[7]),
'qend': int(cols[8]),
'sstart': int(cols[9]),
'send': int(cols[10]),
'evalue': float(cols[11]),
'bitscore': float(cols[12]),
}

# Extract locus from the subject accession (saccver)
locus = "_".join(record['saccver'].split("_")[:-1])
key = (locus, record['qaccver']) # Locus-contig combination

# On-the-fly sorting: Keep the best hit per locus-contig pair
if key in best_hits:
# Compare current hit with the stored best hit
best_hit = best_hits[key]
if (
record['pident'] > best_hit['pident'] or # Higher percent identity
(record['pident'] == best_hit['pident'] and record['bitscore'] > best_hit['bitscore']) or # Higher bitscore
(record['pident'] == best_hit['pident'] and record['bitscore'] == best_hit['bitscore'] and record['evalue'] < best_hit['evalue']) # Lower e-value
):
best_hits[key] = record
else:
# Store this hit as the best for the locus-contig combination
best_hits[key] = record

# Collect full-coverage, 100%-identity hits
if record['length'] == record['slen'] and record['pident'] == 100.0:
full_coverage_hits.append(record)

# Collect the final list of hits
final_hits = list(best_hits.values())

# Include all full-coverage hits not already in the final list
for hit in full_coverage_hits:
if hit not in final_hits:
final_hits.append(hit)

# Process the final hits into allele format
alleles = []
for hit in final_hits:
if hit['length'] / hit['slen'] >= 0.6:
alleles.append(process_hit(hit, assembly_sequences))

return alleles

def run_blat_and_parse(query_fa, db_fa, output_tsv, assembly_sequences):
"""
Runs BLAT and processes the outputs.
"""
# Define the BLAT command
blat_cmd = [
"blat",
db_fa,
query_fa,
"-minIdentity=90",
"-t=dna",
"-q=dna",
"-out=blast8",
output_tsv
]

# Run BLAT
try:
subprocess.run(blat_cmd, check=True, text=True)
except subprocess.CalledProcessError as e:
raise RuntimeError(f"BLAT command failed with error: {e}")

# Parse the BLAT output
return parse_blat_output(output_tsv, assembly_sequences)


def rule__blat_genecall(input: object, output: object, params: object, log: object) -> None:
try:
samplecomponent_ref_json = params.samplecomponent_ref_json
samplecomponent_ref = SampleComponentReference(value=samplecomponent_ref_json)
samplecomponent = SampleComponent.load(samplecomponent_ref)
sample = Sample.load(samplecomponent.sample)
component = Component.load(samplecomponent.component)
sample_name = sample["name"]
# Variables being used
# resources_dir = component['resources']['schemes']
species_detection = sample.get_category("species_detection")
detected_species = species_detection["summary"]["detected_species"]

os.makedirs(output.gene_call_results, exist_ok=True)
# process_single_assembly(
# assembly_file=input.genome,
# db=Path(params.chewbbaca_blastdb)/component["options"]["chewbbaca_species_mapping"]['blastdb'][detected_species],
# output_dir=output.gene_call_results, combined_dir, assemblies_dir):
process_single_assembly(
assembly_path=input.genome,
db=params.blat_db_fasta #Path(params.chewbbaca_blastdb)/component["options"]["chewbbaca_species_mapping"]['blastdb'][detected_species],
output_file=output.gene_calls,log=log)

# process_loci_parallel(
# component["options"]["chewbbaca_species_mapping"]['blastdb'][detected_species],
# input.genome,
# output.gene_call_results)
with open(output.gene_call_done, "w", encoding="utf-8") as fh:
fh.write("")
except Exception:
with open(log.err_file, "w+", encoding="utf-8") as fh:
fh.write(traceback.format_exc())
raise


def process_hit(hit, assembly_sequences):
"""
Adjusts the start and end positions of a hit and ensures it fits within contig limits.
Returns a tuple with adjusted hit details.
"""
alleles=[]
qaccver, saccver, slen, qlen, sstart, send, qstart, qend = (
hit['qaccver'], hit['saccver'], int(hit['slen']), int(hit['length']),
int(hit['sstart']), int(hit['send']), int(hit['qstart']), int(hit['qend'])
)

# Ensure that the smallest position is taken as start and the largest as end
subject_start = min(sstart, send)
subject_end = max(sstart, send)

# Adjust start and end points to ensure they're within contig limits
contig_length = len(assembly_sequences[qaccver])

if subject_start != 1:
qstart += subject_start - 1 # Adjust qstart if the alignment doesn't start at position 1
if subject_end != slen:
qend += slen - subject_end # Adjust qend if the alignment doesn't end at the full length
qend = min(qend, contig_length)

# Convert qstart to 0-based for BED format and adjust qstart/qend if necessary
qstart -= 1
qstart = min(qstart, qend)
qend = max(qstart, qend)

return qaccver, qstart, qend, saccver, slen, qlen



def reverse_complement(seq):
"""
Returns the reverse complement of a sequence.
"""
return str(Seq(seq).reverse_complement())


def reverse_complement_check(seq):
"""
Checks if the sequence should be reverse complemented based on its starting pattern.
"""
return seq.lower().startswith(('tta', 'tca', 'cta'))


def extract_subsequences(fasta_sequences, alleles):
"""
Extracts subsequences from the given FASTA sequences based on the alleles' positions.
Returns a list of tuples (header, subsequence).
"""
extracted_sequences = []
for allele in alleles:
seq_id, qstart, qend, saccver = allele[0], allele[1], allele[2], allele[3]

if seq_id in fasta_sequences:
subseq = fasta_sequences[seq_id][qstart:qend]

# Check if the subsequence needs to be reverse complemented
if reverse_complement_check(subseq):
subseq = reverse_complement(subseq) # Reverse complement if necessary

header = f">{seq_id}_{saccver}_{qstart + 1}_{qend}"
extracted_sequences.append((header, subseq))

return extracted_sequences


def read_fasta(file_path):
"""
Reads a FASTA file and returns a dictionary of sequences.
"""
return {record.id: str(record.seq) for record in SeqIO.parse(file_path, "fasta")}


def process_single_assembly(assembly_path, db_fasta, output_file, log):
"""
Processes a single assembly against the specified database and writes the combined alleles to a single file.
"""
assembly_name = os.path.basename(assembly_path).replace('.fasta', '').replace('.fa', '')

# Read the assembly sequences into memory once per assembly
fasta_sequences = read_fasta(assembly_path)

# Run BLAST and parse the output
output_tsv = os.path.join(os.path.dirname(output_file), f"blat_{assembly_name}.out")
alleles = run_blat_and_parse(assembly_path, db_fasta, output_tsv, fasta_sequences)

#alleles = run_blast_and_parse(assembly_path, db, fasta_sequences, log=log)

# blast_output = os.path.join(output_dir, f'blast_{assembly_name}.out')
# run_blastn(assembly_path, db, blast_output)

# # Parse and extract alleles based on the BLAST results
# alleles = parse_blast_output(blast_output, fasta_sequences)
extracted_sequences = extract_subsequences(fasta_sequences, alleles)

# Write extracted sequences to the combined FASTA file
with open(output_file, 'w') as combined_file:
for header, subseq in extracted_sequences:
combined_file.write(f"{header}\n{subseq}\n")

rule__blat_genecall(snakemake.input, snakemake.output, snakemake.params, snakemake.log)

0 comments on commit f254f5d

Please sign in to comment.