From e0ae571115adbe8affb7ca379afc277a6f646873 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Wed, 15 Jan 2025 12:23:44 -0500 Subject: [PATCH 01/20] Reorganize new models --- src/fusor/models.py | 64 +++++++++++++++++++++++++++++++++++++++++++- tests/test_models.py | 35 ++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 3a21f4d..ff60c21 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -39,8 +39,10 @@ class FUSORTypes(str, Enum): MULTIPLE_POSSIBLE_GENES_ELEMENT = "MultiplePossibleGenesElement" BREAKPOINT_COVERAGE = "BreakpointCoverage" CONTIG_SEQUENCE = "ContigSequence" + ANCHORED_READS = "AnchoredReads" SPLIT_READS = "SplitReads" SPANNING_READS = "SpanningReads" + READ_DATA = "ReadData" REGULATORY_ELEMENT = "RegulatoryElement" CATEGORICAL_FUSION = "CategoricalFusion" ASSAYED_FUSION = "AssayedFusion" @@ -154,6 +156,18 @@ class ContigSequence(BaseStructuralElement): ) +class AnchoredReads(BaseStructuralElement): + """Define AnchoredReads class + + This class can be used to report the number of reads that span the + fusion junction. This is used at the TranscriptSegment level, as it + indicates the transcript where the longer segment of the read is found + """ + + type: Literal[FUSORTypes.ANCHORED_READS] = FUSORTypes.ANCHORED_READS + reads: int = Field(ge=0) + + class SplitReads(BaseStructuralElement): """Define SplitReads class. @@ -184,6 +198,28 @@ class SpanningReads(BaseStructuralElement): ) +class ReadData(BaseStructuralElement): + """Define ReadData class. 
+ + This class is used at the AssayedFusion level when a fusion caller reports + metadata describing sequencing reads for the fusion event + """ + + type: Literal[FUSORTypes.READ_DATA] = FUSORTypes.READ_DATA + split: SplitReads | None = None + spanning: SpanningReads | None = None + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "type": "ReadData", + "split": {"type": "SplitReads", "splitReads": 100}, + "spanning": {"type": "SpanningReads", "spanningReads": 80}, + } + } + ) + + class TranscriptSegmentElement(BaseStructuralElement): """Define TranscriptSegment class""" @@ -199,6 +235,7 @@ class TranscriptSegmentElement(BaseStructuralElement): elementGenomicStart: SequenceLocation | None = None elementGenomicEnd: SequenceLocation | None = None coverage: BreakpointCoverage | None = None + anchoredReads: AnchoredReads | None = None @model_validator(mode="before") def check_exons(cls, values): @@ -264,6 +301,14 @@ def check_exons(cls, values): }, "start": 154170399, }, + "coverage": { + "type": "BreakpointCoverage", + "fragmentCoverage": 185, + }, + "anchoredReads": { + "type": "AnchoredReads", + "reads": 100, + }, } }, ) @@ -645,7 +690,8 @@ class Assay(BaseModelForbidExtra): | TemplatedSequenceElement | LinkerElement | UnknownGeneElement - | ContigSequence, + | ContigSequence + | ReadData, Field(discriminator="type"), ] @@ -695,6 +741,7 @@ class AssayedFusion(AbstractFusion): causativeEvent: CausativeEvent | None = None assay: Assay | None = None contig: ContigSequence | None = None + readData: ReadData | None = None model_config = ConfigDict( json_schema_extra={ @@ -712,6 +759,21 @@ class AssayedFusion(AbstractFusion): "assayName": "fluorescence in-situ hybridization assay", "fusionDetection": "inferred", }, + "contig": { + "type": "ContigSequence", + "contig": "GTACTACTGATCTAGCATCTAGTA", + }, + "readData": { + "type": "ReadData", + "split": { + "type": "SplitReads", + "splitReads": 100, + }, + "spanning": { + "type": "SpanningReads", + 
"spanningReads": 80, + }, + }, "structure": [ { "type": "GeneElement", diff --git a/tests/test_models.py b/tests/test_models.py index 72879af..4159383 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -6,6 +6,7 @@ from fusor.models import ( AbstractFusion, + AnchoredReads, Assay, AssayedFusion, BreakpointCoverage, @@ -17,6 +18,7 @@ GeneElement, LinkerElement, MultiplePossibleGenesElement, + ReadData, RegulatoryElement, SpanningReads, SplitReads, @@ -178,6 +180,8 @@ def transcript_segments(sequence_locations, gene_examples): "gene": gene_examples[0], "elementGenomicStart": sequence_locations[2], "elementGenomicEnd": sequence_locations[3], + "coverage": BreakpointCoverage(fragmentCoverage=100), + "anchoredReads": AnchoredReads(reads=85), }, { "type": "TranscriptSegmentElement", @@ -379,6 +383,8 @@ def test_transcript_segment_element(transcript_segments): assert test_region_start.type == "SequenceLocation" test_region_end = test_element.elementGenomicEnd assert test_region_end.type == "SequenceLocation" + assert test_element.coverage.fragmentCoverage == 100 + assert test_element.anchoredReads.reads == 85 test_element = TranscriptSegmentElement(**transcript_segments[3]) assert test_element.transcript == "refseq:NM_938439.4" @@ -386,6 +392,8 @@ def test_transcript_segment_element(transcript_segments): assert test_element.exonStartOffset == 0 assert test_element.exonEnd is None assert test_element.exonEndOffset is None + assert test_element.coverage is None + assert test_element.anchoredReads is None # check CURIE requirement with pytest.raises(ValidationError) as exc_info: @@ -640,6 +648,18 @@ def test_contig(): check_validation_error(exc_info, msg) +def test_anchored_reads(): + """Test that AnchoredReads class initializes correctly""" + test_anchored_reads = AnchoredReads(reads=100) + assert test_anchored_reads.reads == 100 + + # test enum validation + with pytest.raises(ValidationError) as exc_info: + assert AnchoredReads(type="anchoredreads") + msg = 
"Input should be " + check_validation_error(exc_info, msg) + + def test_split_reads(): """Test that SplitReads class initializes correctly""" test_split_reads = SplitReads(splitReads=97) @@ -664,6 +684,21 @@ def test_spanning_reads(): check_validation_error(exc_info, msg) +def test_read_data(): + """Test that ReadData class initializes correctly""" + test_read_data = ReadData( + split=SplitReads(splitReads=100), spanning=SpanningReads(spanningReads=90) + ) + assert test_read_data.split.splitReads == 100 + assert test_read_data.spanning.spanningReads == 90 + + # test enum validation + with pytest.raises(ValidationError) as exc_info: + assert ReadData(type="readata") + msg = "Input should be " + check_validation_error(exc_info, msg) + + def test_event(): """Test Event object initializes correctly""" rearrangement = EventType.REARRANGEMENT From 23a164447f681a231861c1033bc88e0f36a5f4fc Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Wed, 15 Jan 2025 14:45:51 -0500 Subject: [PATCH 02/20] Incorporate metadata classes into translators --- src/fusor/fusor.py | 8 + src/fusor/models.py | 4 +- src/fusor/translator.py | 120 +++++++++++- tests/test_translators.py | 379 +++++++++++++++++++++++++++++--------- 4 files changed, 417 insertions(+), 94 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index cbf9319..0d631b5 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -23,10 +23,12 @@ from fusor.exceptions import FUSORParametersException, IDTranslationException from fusor.models import ( + AnchoredReads, Assay, AssayedFusion, AssayedFusionElement, BaseStructuralElement, + BreakpointCoverage, CategoricalFusion, CategoricalFusionElement, CausativeEvent, @@ -219,6 +221,8 @@ async def transcript_segment_element( tx_to_genomic_coords: bool = True, use_minimal_gene: bool = True, seq_id_target_namespace: str | None = None, + coverage: BreakpointCoverage | None = None, + reads: AnchoredReads | None = None, **kwargs, ) -> tuple[TranscriptSegmentElement | 
None, list[str] | None]: """Create transcript segment element. @@ -230,6 +234,8 @@ async def transcript_segment_element( gene-normalizer's entire gene object will be used :param seq_id_target_namespace: If want to use digest for ``sequence_id``, set this to the namespace you want the digest for. Otherwise, leave as ``None``. + :param coverage: The read coverage located near the specified breakpoint + :param reads: The read data for the specified breakpoint :param kwargs: If ``tx_to_genomic_coords``, possible key word arguments: @@ -307,6 +313,8 @@ async def transcript_segment_element( gene=normalized_gene_response[0], elementGenomicStart=genomic_start_location, elementGenomicEnd=genomic_end_location, + coverage=coverage if coverage else None, + anchoredReads=reads if reads else None, ), None, ) diff --git a/src/fusor/models.py b/src/fusor/models.py index ff60c21..9eed493 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -146,7 +146,9 @@ class ContigSequence(BaseStructuralElement): type: Literal[FUSORTypes.CONTIG_SEQUENCE] = FUSORTypes.CONTIG_SEQUENCE contig: Annotated[ str, - StringConstraints(strip_whitespace=True, to_upper=True, pattern=r"^[ACGT]+$"), + StringConstraints( + strip_whitespace=True, to_upper=True, pattern=r"^(?:[^A-Za-z]|[ACTGactg])*$" + ), ] model_config = ConfigDict( diff --git a/src/fusor/translator.py b/src/fusor/translator.py index 1698206..60284d3 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -10,11 +10,17 @@ from fusor.fusor import FUSOR from fusor.models import ( + AnchoredReads, Assay, AssayedFusion, + BreakpointCoverage, CausativeEvent, + ContigSequence, EventType, GeneElement, + ReadData, + SpanningReads, + SplitReads, TranscriptSegmentElement, ) @@ -56,6 +62,8 @@ def _format_fusion( ce: CausativeEvent | None = None, rf: bool | None = None, assay: Assay | None = None, + contig: ContigSequence | None = None, + reads: ReadData | None = None, ) -> AssayedFusion: """Format classes to create AssayedFusion 
objects @@ -66,6 +74,8 @@ def _format_fusion( :param ce: CausativeEvent :param rf: A boolean indicating if the reading frame is preserved :param assay: Assay + :param contig: The contig sequence + :param reads: The read data :return AssayedFusion object """ params = {} @@ -84,6 +94,10 @@ def _format_fusion( params["readingFramePreserved"] = rf if assay: params["assay"] = assay + if contig: + params["contig"] = contig + if reads: + params["readData"] = reads return AssayedFusion(**params) def _get_causative_event( @@ -186,6 +200,8 @@ async def from_jaffa( rearrangement: bool, classification: str, inframe: bool, + spanning_reads: int, + spanning_pairs: int, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion | None: @@ -199,6 +215,12 @@ async def from_jaffa( :param rearrangement: A boolean indicating if a rearrangement occured :param classification: The classification associated with the called fusion :param inframe: A boolean indicating if the fusion occurred in-frame + :param spanning_reads: The number of detected reads that span the junction + between the two transcripts. Although described as spanning reads, this + aligns with our definition of split reads (i.e. 
reads that have sequence + belonging to the fusion partners) + :param spanning_pairs: The number of detected reads that align entirely on + either side of the breakpoint :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful @@ -238,8 +260,19 @@ async def from_jaffa( else: ce = None + read_data = ReadData( + split=SplitReads(splitReads=spanning_reads), + spanning=SpanningReads(spanningReads=spanning_pairs), + ) + return self._format_fusion( - gene_5prime_element, gene_3prime_element, tr_5prime, tr_3prime, ce, inframe + gene_5prime_element, + gene_3prime_element, + tr_5prime, + tr_3prime, + ce, + inframe, + reads=read_data, ) async def from_star_fusion( @@ -249,6 +282,8 @@ async def from_star_fusion( left_breakpoint: str, right_breakpoint: str, annots: str, + junction_read_count: int, + spanning_frag_count: int, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion: @@ -259,6 +294,11 @@ async def from_star_fusion( :param left_breakpoint: The gene indicated in the LeftBreakpoint column :param right_breakpoint: The gene indicated in the RightBreakpoint column :param annots: The annotations associated with the fusion + :param junction_read_count: The number of RNA-seq fragments that split the + junction between the two transcript segments (from STAR-Fusion documentation) + :param spanning_frag_count: The number of RNA-seq fragments that encompass the + fusion junction such that one read of the pair aligns to a different gene + than the other paired-end read of that fragment (from STAR-Fusion documentation) :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful @@ -295,8 +335,18 @@ async def from_star_fusion( ) ce = self._get_causative_event(five_prime[0], three_prime[0], ",".join(annots)) + 
read_data = ReadData( + split=SplitReads(splitReads=junction_read_count), + spanning=SpanningReads(spanningReads=spanning_frag_count), + ) + return self._format_fusion( - gene_5prime_element, gene_3prime_element, tr_5prime, tr_3prime, ce + gene_5prime_element, + gene_3prime_element, + tr_5prime, + tr_3prime, + ce, + reads=read_data, ) async def from_fusion_catcher( @@ -306,6 +356,9 @@ async def from_fusion_catcher( five_prime_fusion_point: str, three_prime_fusion_point: str, predicted_effect: str, + spanning_unique_reads: int, + spanning_reads: int, + fusion_sequence: str, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion: @@ -319,6 +372,10 @@ async def from_fusion_catcher( fusion junction. This coordinate is 1-based :param predicted_effect: The predicted effect of the fusion event, created using annotation from the Ensembl database + :param spanning_unique_reads: The number of unique reads that map on the fusion + junction + :param spanning_reads: The number of paired reads that support the fusion + :param fusion_sequence: The inferred sequence around the fusion junction :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful @@ -356,8 +413,20 @@ async def from_fusion_catcher( ) ce = self._get_causative_event(five_prime[0], three_prime[0], predicted_effect) + read_data = ReadData( + split=SplitReads(splitReads=spanning_unique_reads), + spanning=SpanningReads(spanningReads=spanning_reads), + ) + contig = ContigSequence(contig=fusion_sequence) + return self._format_fusion( - gene_5prime_element, gene_3prime_element, tr_5prime, tr_3prime, ce + gene_5prime_element, + gene_3prime_element, + tr_5prime, + tr_3prime, + ce, + contig=contig, + reads=read_data, ) async def from_fusion_map( @@ -431,6 +500,12 @@ async def from_arriba( direction1: str, direction2: str, rf: str, + split_reads1: int, + split_reads2: int, + 
discordant_mates: int, + coverage1: int, + coverage2: int, + fusion_transcript: str, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion: @@ -449,6 +524,12 @@ async def from_arriba( :param direction2: A description that indicates if the transcript segment starts or ends at breakpoint2 :param rf: A description if the reading frame is preserved for the fusion + :param split_reads1: Number of supporting split fragments with anchor in gene1 + :param split_reads2: Number of supporting split fragments with anchor in gene2 + :param discordant_mates: Number of discordant mates supporting the fusion + :param coverage1: Number of fragments retained near breakpoint1 + :param coverage2: Number of fragments retained near breakpoint2 + :param fusion_transcript: The assembled fusion transcript :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful @@ -484,6 +565,8 @@ async def from_arriba( seg_start_genomic=int(breakpoint1[1]) if gene1_seg_start else None, seg_end_genomic=int(breakpoint1[1]) if not gene1_seg_start else None, gene=gene_5prime, + coverage=BreakpointCoverage(fragmentCoverage=coverage1), + reads=AnchoredReads(reads=split_reads1), coordinate_type=coordinate_type, starting_assembly=rb, ) @@ -494,6 +577,8 @@ async def from_arriba( seg_start_genomic=int(breakpoint2[1]) if gene2_seg_start else None, seg_end_genomic=int(breakpoint2[1]) if not gene2_seg_start else None, gene=gene_3prime, + coverage=BreakpointCoverage(fragmentCoverage=coverage2), + reads=AnchoredReads(reads=split_reads2), coordinate_type=coordinate_type, starting_assembly=rb, ) @@ -510,8 +595,18 @@ async def from_arriba( ) ) rf = bool(rf == "in-frame") if rf != "." 
else None + read_data = ReadData(spanning=SpanningReads(spanningReads=discordant_mates)) + contig = ContigSequence(contig=fusion_transcript) + return self._format_fusion( - gene_5prime_element, gene_3prime_element, tr_5prime, tr_3prime, ce, rf + gene_5prime_element, + gene_3prime_element, + tr_5prime, + tr_3prime, + ce, + rf, + contig=contig, + reads=read_data, ) async def from_cicero( @@ -524,6 +619,11 @@ async def from_cicero( pos_3prime: int, sv_ort: str, event_type: str, + reads_a: int, + reads_b: int, + coverage_a: int, + coverage_b: int, + contig: str, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion | str: @@ -538,6 +638,11 @@ async def from_cicero( :param sv_ort: Whether the mapping orientation of assembled contig (driven by structural variation) has confident biological meaning :param event_type: The structural variation event that created the called fusion + :param reads_a: The number of reads that support the breakpoint for the 5' partner + :param reads_b: The number of reads that support the breakpoint for the 3' partner + :param coverage_a: The fragment coverage at the 5' breakpoint + :param coverage_b: The fragment coverage at the 3' breakpoint + :param contig: The assembled contig sequence for the fusion :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful @@ -571,6 +676,8 @@ async def from_cicero( genomic_ac=self._get_genomic_ac(chr_5prime, rb), seg_end_genomic=pos_5prime, gene=gene_5prime, + coverage=BreakpointCoverage(fragmentCoverage=coverage_a), + reads=AnchoredReads(reads=reads_a), coordinate_type=coordinate_type, starting_assembly=rb, ) @@ -580,6 +687,8 @@ async def from_cicero( genomic_ac=self._get_genomic_ac(chr_3prime, rb), seg_start_genomic=pos_3prime, gene=gene_3prime, + coverage=BreakpointCoverage(fragmentCoverage=coverage_b), + reads=AnchoredReads(reads=reads_b), 
coordinate_type=coordinate_type, starting_assembly=rb, ) @@ -594,12 +703,15 @@ async def from_cicero( eventType=EventType("rearrangement"), eventDescription=event_type, ) + contig = ContigSequence(contig=contig) + return self._format_fusion( gene_5prime_element, gene_3prime_element, tr_5prime, tr_3prime, ce, + contig=contig, ) async def from_mapsplice( diff --git a/tests/test_translators.py b/tests/test_translators.py index 0be503e..0d4beb3 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -4,108 +4,126 @@ import pytest from cool_seq_tool.schemas import Assembly, CoordinateType -from fusor.models import AssayedFusion +from fusor.models import ( + AnchoredReads, + AssayedFusion, + BreakpointCoverage, + ContigSequence, + ReadData, + SpanningReads, + SplitReads, +) from fusor.translator import Caller @pytest.fixture(scope="module") def fusion_data_example(): """Create example assayed fusion for TPM3::PDGFRB with exonic breakpoints""" - params = { - "type": "AssayedFusion", - "structure": [ - { - "type": "TranscriptSegmentElement", - "transcript": "refseq:NM_152263.4", - "exonEnd": 8, - "exonEndOffset": -66, - "gene": {"id": "hgnc:12012", "type": "Gene", "label": "TPM3"}, - "elementGenomicEnd": { - "id": "ga4gh:SL.6lXn5i3zqcZUfmtBSieTiVL4Nt2gPGKY", - "type": "SequenceLocation", - "digest": "6lXn5i3zqcZUfmtBSieTiVL4Nt2gPGKY", - "sequenceReference": { - "id": "refseq:NC_000001.11", - "type": "SequenceReference", - "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + + def _create_base_fixture(**kwargs): + params = { + "type": "AssayedFusion", + "structure": [ + { + "type": "TranscriptSegmentElement", + "transcript": "refseq:NM_152263.4", + "exonEnd": 8, + "exonEndOffset": -66, + "gene": {"id": "hgnc:12012", "type": "Gene", "label": "TPM3"}, + "elementGenomicEnd": { + "id": "ga4gh:SL.6lXn5i3zqcZUfmtBSieTiVL4Nt2gPGKY", + "type": "SequenceLocation", + "digest": "6lXn5i3zqcZUfmtBSieTiVL4Nt2gPGKY", + "sequenceReference": { + "id": 
"refseq:NC_000001.11", + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 154170465, }, - "start": 154170465, }, - }, - { - "type": "TranscriptSegmentElement", - "transcript": "refseq:NM_002609.4", - "exonStart": 11, - "exonStartOffset": 2, - "gene": {"id": "hgnc:8804", "type": "Gene", "label": "PDGFRB"}, - "elementGenomicStart": { - "id": "ga4gh:SL.Sp1lwuHbRCkWIoe4zzwVKPsS8zK8i0ck", - "type": "SequenceLocation", - "digest": "Sp1lwuHbRCkWIoe4zzwVKPsS8zK8i0ck", - "sequenceReference": { - "id": "refseq:NC_000005.10", - "type": "SequenceReference", - "refgetAccession": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", + { + "type": "TranscriptSegmentElement", + "transcript": "refseq:NM_002609.4", + "exonStart": 11, + "exonStartOffset": 2, + "gene": {"id": "hgnc:8804", "type": "Gene", "label": "PDGFRB"}, + "elementGenomicStart": { + "id": "ga4gh:SL.Sp1lwuHbRCkWIoe4zzwVKPsS8zK8i0ck", + "type": "SequenceLocation", + "digest": "Sp1lwuHbRCkWIoe4zzwVKPsS8zK8i0ck", + "sequenceReference": { + "id": "refseq:NC_000005.10", + "type": "SequenceReference", + "refgetAccession": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", + }, + "end": 150126612, }, - "end": 150126612, }, - }, - ], - "causativeEvent": {"type": "CausativeEvent", "eventType": "rearrangement"}, - "r_frame_preserved": True, - "assay": None, - } - return AssayedFusion(**params) + ], + "causativeEvent": {"type": "CausativeEvent", "eventType": "rearrangement"}, + "r_frame_preserved": True, + "assay": None, + } + assayed_fusion = AssayedFusion(**params) + return assayed_fusion.model_copy(update=kwargs) + + return _create_base_fixture @pytest.fixture(scope="module") def fusion_data_example_nonexonic(): """Create example assayed fusion for TPM3::PDGFRB with non-exonic breakpoints""" - params = { - "type": "AssayedFusion", - "structure": [ - { - "type": "TranscriptSegmentElement", - "transcript": "refseq:NM_152263.4", - "exonEnd": 4, - "exonEndOffset": 5, - "gene": {"id": "hgnc:12012", 
"type": "Gene", "label": "TPM3"}, - "elementGenomicEnd": { - "id": "ga4gh:SL.O1rVKQA2FTdy_FFWg3qJVSTG_TF_Mkex", - "type": "SequenceLocation", - "digest": "O1rVKQA2FTdy_FFWg3qJVSTG_TF_Mkex", - "sequenceReference": { - "id": "refseq:NC_000001.11", - "type": "SequenceReference", - "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + + def _create_base_fixture(**kwargs): + params = { + "type": "AssayedFusion", + "structure": [ + { + "type": "TranscriptSegmentElement", + "transcript": "refseq:NM_152263.4", + "exonEnd": 4, + "exonEndOffset": 5, + "gene": {"id": "hgnc:12012", "type": "Gene", "label": "TPM3"}, + "elementGenomicEnd": { + "id": "ga4gh:SL.O1rVKQA2FTdy_FFWg3qJVSTG_TF_Mkex", + "type": "SequenceLocation", + "digest": "O1rVKQA2FTdy_FFWg3qJVSTG_TF_Mkex", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 154173078, }, - "start": 154173078, }, - }, - { - "type": "TranscriptSegmentElement", - "transcript": "refseq:NM_002609.4", - "exonStart": 11, - "exonStartOffset": -559, - "gene": {"id": "hgnc:8804", "type": "Gene", "label": "PDGFRB"}, - "elementGenomicStart": { - "id": "ga4gh:SL.GtoWMuox4tOyX2I5L9Baobnpgc1pDIVJ", - "type": "SequenceLocation", - "digest": "GtoWMuox4tOyX2I5L9Baobnpgc1pDIVJ", - "sequenceReference": { - "id": "refseq:NC_000005.10", - "type": "SequenceReference", - "refgetAccession": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", + { + "type": "TranscriptSegmentElement", + "transcript": "refseq:NM_002609.4", + "exonStart": 11, + "exonStartOffset": -559, + "gene": {"id": "hgnc:8804", "type": "Gene", "label": "PDGFRB"}, + "elementGenomicStart": { + "id": "ga4gh:SL.GtoWMuox4tOyX2I5L9Baobnpgc1pDIVJ", + "type": "SequenceLocation", + "digest": "GtoWMuox4tOyX2I5L9Baobnpgc1pDIVJ", + "sequenceReference": { + "id": "refseq:NC_000005.10", + "type": "SequenceReference", + "refgetAccession": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", + }, + "end": 150127173, 
}, - "end": 150127173, }, - }, - ], - "causativeEvent": {"type": "CausativeEvent", "eventType": "rearrangement"}, - "r_frame_preserved": True, - "assay": None, - } - return AssayedFusion(**params) + ], + "causativeEvent": {"type": "CausativeEvent", "eventType": "rearrangement"}, + "r_frame_preserved": True, + "assay": None, + } + assayed_fusion = AssayedFusion(**params) + return assayed_fusion.model_copy(update=kwargs) + + return _create_base_fixture def test_gene_element_arriba(translator_instance): @@ -138,6 +156,8 @@ async def test_jaffa( rearrangement = True classification = "HighConfidence" inframe = True + spanning_reads = 100 + spanning_pairs = 80 jaffa_fusor = await translator_instance.from_jaffa( fusion_genes, @@ -148,10 +168,18 @@ async def test_jaffa( rearrangement, classification, inframe, + spanning_reads, + spanning_pairs, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) + fusion_data_example = fusion_data_example( + readData=ReadData( + split=SplitReads(splitReads=100), spanning=SpanningReads(spanningReads=80) + ) + ) assert jaffa_fusor.structure == fusion_data_example.structure + assert jaffa_fusor.readData == fusion_data_example.readData # Test non-exonic breakpoint fusion_genes = "TPM3:PDGFRB" @@ -162,6 +190,8 @@ async def test_jaffa( rearrangement = True classification = "HighConfidence" inframe = True + spanning_reads = 100 + spanning_pairs = 80 jaffa_fusor_nonexonic = await translator_instance.from_jaffa( fusion_genes, @@ -172,10 +202,18 @@ async def test_jaffa( rearrangement, classification, inframe, + spanning_reads, + spanning_pairs, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) + fusion_data_example_nonexonic = fusion_data_example_nonexonic( + readData=ReadData( + split=SplitReads(splitReads=100), spanning=SpanningReads(spanningReads=80) + ) + ) assert jaffa_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure + assert jaffa_fusor_nonexonic.readData == fusion_data_example_nonexonic.readData 
@pytest.mark.asyncio() @@ -189,6 +227,8 @@ async def test_star_fusion( left_breakpoint = "chr1:154170465:-" right_breakpoint = "chr5:150126612:-" annots = '["INTERCHROMOSOMAL]' + junction_read_count = 100 + spanning_frag_count = 80 star_fusion_fusor = await translator_instance.from_star_fusion( left_gene, @@ -196,10 +236,18 @@ async def test_star_fusion( left_breakpoint, right_breakpoint, annots, + junction_read_count, + spanning_frag_count, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) + fusion_data_example = fusion_data_example( + readData=ReadData( + split=SplitReads(splitReads=100), spanning=SpanningReads(spanningReads=80) + ) + ) assert star_fusion_fusor.structure == fusion_data_example.structure + assert star_fusion_fusor.readData == fusion_data_example.readData # Test non-exonic breakpoints left_gene = "TPM3^ENSG00000143549.19" @@ -207,6 +255,8 @@ async def test_star_fusion( left_breakpoint = "chr1:154173079:-" right_breakpoint = "chr5:150127173:-" annots = '["INTERCHROMOSOMAL]' + junction_read_count = 100 + spanning_frag_count = 80 star_fusion_fusor_nonexonic = await translator_instance.from_star_fusion( left_gene, @@ -214,12 +264,22 @@ async def test_star_fusion( left_breakpoint, right_breakpoint, annots, + junction_read_count, + spanning_frag_count, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) + fusion_data_example_nonexonic = fusion_data_example_nonexonic( + readData=ReadData( + split=SplitReads(splitReads=100), spanning=SpanningReads(spanningReads=80) + ) + ) assert ( star_fusion_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure ) + assert ( + star_fusion_fusor_nonexonic.readData == fusion_data_example_nonexonic.readData + ) @pytest.mark.asyncio() @@ -233,6 +293,9 @@ async def test_fusion_catcher( five_prime_fusion_point = "1:154170465:-" three_prime_fusion_point = "5:150126612:-" predicted_effect = "exonic(no-known-CDS)/exonic(no-known-CDS)" + spanning_unique_reads = 100 + spanning_reads = 80 + 
fusion_sequence = "CTAGATGAC*TACTACTA" fusion_catcher_fusor = await translator_instance.from_fusion_catcher( five_prime_partner, @@ -240,10 +303,21 @@ async def test_fusion_catcher( five_prime_fusion_point, three_prime_fusion_point, predicted_effect, + spanning_unique_reads, + spanning_reads, + fusion_sequence, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) + fusion_data_example = fusion_data_example( + readData=ReadData( + split=SplitReads(splitReads=100), spanning=SpanningReads(spanningReads=80) + ), + contig=ContigSequence(contig="CTAGATGAC*TACTACTA"), + ) assert fusion_catcher_fusor.structure == fusion_data_example.structure + assert fusion_catcher_fusor.readData == fusion_data_example.readData + assert fusion_catcher_fusor.contig == fusion_catcher_fusor.contig # Test non-exonic breakpoint five_prime_partner = "TPM3" @@ -251,6 +325,9 @@ async def test_fusion_catcher( five_prime_fusion_point = "1:154173079:-" three_prime_fusion_point = "5:150127173:-" predicted_effect = "exonic(no-known-CDS)/exonic(no-known-CDS)" + spanning_unique_reads = 100 + spanning_reads = 80 + fusion_sequence = "CTAGATGAC*TACTACTA" fusion_catcher_fusor_nonexonic = await translator_instance.from_fusion_catcher( five_prime_partner, @@ -258,13 +335,27 @@ async def test_fusion_catcher( five_prime_fusion_point, three_prime_fusion_point, predicted_effect, + spanning_unique_reads, + spanning_reads, + fusion_sequence, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) + fusion_data_example_nonexonic = fusion_data_example_nonexonic( + readData=ReadData( + split=SplitReads(splitReads=100), spanning=SpanningReads(spanningReads=80) + ), + contig=ContigSequence(contig="CTAGATGAC*TACTACTA"), + ) assert ( fusion_catcher_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure ) + assert ( + fusion_catcher_fusor_nonexonic.readData + == fusion_data_example_nonexonic.readData + ) + assert fusion_catcher_fusor_nonexonic.contig == fusion_data_example_nonexonic.contig 
@pytest.mark.asyncio() @@ -289,7 +380,7 @@ async def test_fusion_map( fusion_map_fusor = await translator_instance.from_fusion_map( fusion_map_data, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value ) - assert fusion_map_fusor.structure == fusion_data_example.structure + assert fusion_map_fusor.structure == fusion_data_example().structure # Test non-exonic breakpoint fusion_map_data_nonexonic = pl.DataFrame( @@ -309,7 +400,8 @@ async def test_fusion_map( fusion_map_data_nonexonic, CoordinateType.RESIDUE.value, Assembly.GRCH38.value ) assert ( - fusion_map_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure + fusion_map_fusor_nonexonic.structure + == fusion_data_example_nonexonic().structure ) @@ -330,6 +422,12 @@ async def test_arriba( direction1 = "upstream" direction2 = "downstream" rf = "in-frame" + split_reads1 = 100 + split_reads2 = 95 + discordant_mates = 30 + coverage1 = 200 + coverage2 = 190 + fusion_transcript = "CTAGATGAC_TACTACTA|GTACTACT" arriba_fusor = await translator_instance.from_arriba( gene1, @@ -343,10 +441,26 @@ async def test_arriba( direction1, direction2, rf, + split_reads1, + split_reads2, + discordant_mates, + coverage1, + coverage2, + fusion_transcript, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) + fusion_data_example = fusion_data_example( + readData=ReadData(spanning=SpanningReads(spanningReads=30)), + contig=ContigSequence(contig=fusion_transcript), + ) + fusion_data_example.structure[0].coverage = BreakpointCoverage(fragmentCoverage=200) + fusion_data_example.structure[0].anchoredReads = AnchoredReads(reads=100) + fusion_data_example.structure[1].coverage = BreakpointCoverage(fragmentCoverage=190) + fusion_data_example.structure[1].anchoredReads = AnchoredReads(reads=95) assert arriba_fusor.structure == fusion_data_example.structure + assert arriba_fusor.readData == fusion_data_example.readData + assert arriba_fusor.contig == fusion_data_example.contig # Test non-exonic breakpoint gene1 = 
"TPM3" @@ -360,6 +474,11 @@ async def test_arriba( direction1 = "upstream" direction2 = "downstream" rf = "in-frame" + split_reads1 = 100 + split_reads2 = 95 + discordant_mates = 30 + coverage1 = 200 + coverage2 = 190 arriba_fusor_nonexonic = await translator_instance.from_arriba( gene1, @@ -373,10 +492,30 @@ async def test_arriba( direction1, direction2, rf, + split_reads1, + split_reads2, + discordant_mates, + coverage1, + coverage2, + fusion_transcript, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) + fusion_data_example_nonexonic = fusion_data_example_nonexonic( + readData=ReadData(spanning=SpanningReads(spanningReads=30)), + contig=ContigSequence(contig=fusion_transcript), + ) + fusion_data_example_nonexonic.structure[0].coverage = BreakpointCoverage( + fragmentCoverage=200 + ) + fusion_data_example_nonexonic.structure[0].anchoredReads = AnchoredReads(reads=100) + fusion_data_example_nonexonic.structure[1].coverage = BreakpointCoverage( + fragmentCoverage=190 + ) + fusion_data_example_nonexonic.structure[1].anchoredReads = AnchoredReads(reads=95) assert arriba_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure + assert arriba_fusor_nonexonic.readData == fusion_data_example_nonexonic.readData + assert arriba_fusor_nonexonic.contig == fusion_data_example_nonexonic.contig @pytest.mark.asyncio() @@ -393,6 +532,11 @@ async def test_cicero( pos_3prime = 150126612 sv_ort = ">" event_type = "CTX" + reads_a = 100 + reads_b = 90 + coverage_a = 200 + coverage_b = 190 + contig = "ATCATACTAGATACTACTACGATGAGAGAGTACATAGAT" cicero_fusor = await translator_instance.from_cicero( gene_5prime, @@ -403,10 +547,22 @@ async def test_cicero( pos_3prime, sv_ort, event_type, + reads_a, + reads_b, + coverage_a, + coverage_b, + contig, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) + fusion_data_example = fusion_data_example(contig=ContigSequence(contig=contig)) + fusion_data_example.structure[0].coverage = BreakpointCoverage(fragmentCoverage=200) 
+ fusion_data_example.structure[0].anchoredReads = AnchoredReads(reads=100) + fusion_data_example.structure[1].coverage = BreakpointCoverage(fragmentCoverage=190) + fusion_data_example.structure[1].anchoredReads = AnchoredReads(reads=90) assert cicero_fusor.structure == fusion_data_example.structure + assert cicero_fusor.readData == fusion_data_example.readData + assert cicero_fusor.contig == fusion_data_example.contig # Test non-exonic breakpoint gene_5prime = "TPM3" @@ -417,6 +573,11 @@ async def test_cicero( pos_3prime = 150127173 sv_ort = ">" event_type = "CTX" + reads_a = 100 + reads_b = 90 + coverage_a = 200 + coverage_b = 190 + contig = "ATCATACTAGATACTACTACGATGAGAGAGTACATAGAT" cicero_fusor_nonexonic = await translator_instance.from_cicero( gene_5prime, @@ -427,10 +588,28 @@ async def test_cicero( pos_3prime, sv_ort, event_type, + reads_a, + reads_b, + coverage_a, + coverage_b, + contig, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) + fusion_data_example_nonexonic = fusion_data_example_nonexonic( + contig=ContigSequence(contig=contig) + ) + fusion_data_example_nonexonic.structure[0].coverage = BreakpointCoverage( + fragmentCoverage=200 + ) + fusion_data_example_nonexonic.structure[0].anchoredReads = AnchoredReads(reads=100) + fusion_data_example_nonexonic.structure[1].coverage = BreakpointCoverage( + fragmentCoverage=190 + ) + fusion_data_example_nonexonic.structure[1].anchoredReads = AnchoredReads(reads=90) assert cicero_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure + assert cicero_fusor_nonexonic.readData == fusion_data_example_nonexonic.readData + assert cicero_fusor_nonexonic.contig == fusion_data_example_nonexonic.contig # Test case where the called fusion does not have confident biological meaning gene_5prime = "TPM3" @@ -441,6 +620,11 @@ async def test_cicero( pos_3prime = 150127173 sv_ort = "?" 
event_type = "CTX" + reads_a = 100 + reads_b = 90 + coverage_a = 200 + coverage_b = 190 + contig = "ATCATACTAGATACTACTACGATGAGAGAGTACATAGAT" non_confident_bio = await translator_instance.from_cicero( gene_5prime, @@ -451,6 +635,11 @@ async def test_cicero( pos_3prime, sv_ort, event_type, + reads_a, + reads_b, + coverage_a, + coverage_b, + contig, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) @@ -468,6 +657,11 @@ async def test_cicero( pos_3prime = 150127173 sv_ort = "?" event_type = "CTX" + reads_a = 100 + reads_b = 90 + coverage_a = 200 + coverage_b = 190 + contig = "ATCATACTAGATACTACTACGATGAGAGAGTACATAGAT" multiple_genes_fusion_partner = await translator_instance.from_cicero( gene_5prime, @@ -478,6 +672,11 @@ async def test_cicero( pos_3prime, sv_ort, event_type, + reads_a, + reads_b, + coverage_a, + coverage_b, + contig, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) @@ -510,7 +709,7 @@ async def test_enfusion( CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) - assert enfusion_fusor.structure == fusion_data_example.structure + assert enfusion_fusor.structure == fusion_data_example().structure # Test non-exonic breakpoint gene_5prime = "TPM3" @@ -530,7 +729,9 @@ async def test_enfusion( CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) - assert enfusion_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure + assert ( + enfusion_fusor_nonexonic.structure == fusion_data_example_nonexonic().structure + ) @pytest.mark.asyncio() @@ -560,7 +761,7 @@ async def test_genie( CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) - assert genie_fusor.structure == fusion_data_example.structure + assert genie_fusor.structure == fusion_data_example().structure # Test non-exonic breakpoint site1_hugo = "TPM3" @@ -584,4 +785,4 @@ async def test_genie( CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) - assert genie_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure + assert 
genie_fusor_nonexonic.structure == fusion_data_example_nonexonic().structure From 3d9218073ccd6850ef55c2ff3d7b41523e0f3327 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 16 Jan 2025 10:39:16 -0500 Subject: [PATCH 03/20] Add class for JAFFA --- src/fusor/fusion_caller_models.py | 54 +++++++++++++++++++++++++ src/fusor/translator.py | 47 ++++++---------------- tests/test_translators.py | 67 +++++++++++++------------------ 3 files changed, 94 insertions(+), 74 deletions(-) create mode 100644 src/fusor/fusion_caller_models.py diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py new file mode 100644 index 0000000..ab34e75 --- /dev/null +++ b/src/fusor/fusion_caller_models.py @@ -0,0 +1,54 @@ +"""Schemas for fusion callers used in translator.py""" + +from enum import Enum +from typing import Literal + +from pydantic import BaseModel, Field + + +class BaseModelForbidExtra(BaseModel, extra="forbid"): + """Base Pydantic model class with extra values forbidden.""" + + +class FusionCallerTypes(str, Enum): + """Define FusionCaller type values""" + + JAFFA = "JAFFA" + + +class JAFFA(BaseModel): + """Define parameters for JAFFA model""" + + type: Literal[FusionCallerTypes.JAFFA] = FusionCallerTypes.JAFFA + fusion_genes: str = Field( + ..., description="A string containing the two fusion partners" + ) + chrom1: str = Field( + ..., description="The chromosome indicated in the chrom1 column" + ) + base1: int = Field( + ..., description="The genomic position indicated in the base1 column" + ) + chrom2: str = Field( + ..., description="The chromosome indicated in the chrom2 column" + ) + base2: int = Field( + ..., description="The genomic position indicated in the base2 column" + ) + rearrangement: bool = Field( + ..., description=" A boolean indicating if a rearrangement occured" + ) + classification: str = Field( + ..., description="The classification associated with the called fusion" + ) + inframe: bool = Field( + ..., description="A 
boolean indicating if the fusion occurred in-frame" + ) + spanning_reads: int = Field( + ..., + description="The number of deteced reads that span the junction bewtween the two transcript. Although described as spanning reads, this aligns with our defintion of split reads i.e. reads that have sequence belonging to the fusion partners", + ) + spanning_pairs: int = Field( + ..., + description="The number of detected reads that align entirely on either side of the breakpoint", + ) diff --git a/src/fusor/translator.py b/src/fusor/translator.py index 60284d3..1e1fe8a 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -8,6 +8,7 @@ import polars as pl from cool_seq_tool.schemas import Assembly, CoordinateType +from fusor.fusion_caller_models import JAFFA from fusor.fusor import FUSOR from fusor.models import ( AnchoredReads, @@ -192,40 +193,18 @@ def _get_genomic_ac(self, chrom: str, build: Assembly) -> str: async def from_jaffa( self, - fusion_genes: str, - chrom1: str, - base1: int, - chrom2: str, - base2: int, - rearrangement: bool, - classification: str, - inframe: bool, - spanning_reads: int, - spanning_pairs: int, + jaffa: JAFFA, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion | None: """Parse JAFFA fusion output to create AssayedFusion object - :param fusion_genes: A string containing the two fusion partners - :param chrom1: The chromosome indicated in the chrom1 column - :param base1: The genomic position indicated in the base1 column - :param chrom2: The chromosome indicated in the chrom2 column - :param base2: The genomic position indicated in the base2 column - :param rearrangement: A boolean indicating if a rearrangement occured - :param classification: The classification associated with the called fusion - :param inframe: A boolean indicating if the fusion occurred in-frame - :param spanning_reads: The number of deteced reads that span the junction - bewtween the two transcript. 
Although described as spanning reads, this - aligns with our defintion of split reads (i.e. reads that have sequence - belonging to the fusion partners) - :param spanning_pairs: The number of detected reads that align entirely on - either side of the breakpoint + :param JAFFA: A JAFFA object :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful """ - genes = fusion_genes.split(":") + genes = jaffa.fusion_genes.split(":") gene_5prime_element = self._get_gene_element(genes[0], Caller.JAFFA) gene_3prime_element = self._get_gene_element(genes[1], Caller.JAFFA) gene_5prime = gene_5prime_element.gene.label @@ -236,8 +215,8 @@ async def from_jaffa( tr_5prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(chrom1, rb), - seg_end_genomic=base1, + genomic_ac=self._get_genomic_ac(jaffa.chrom1, rb), + seg_end_genomic=jaffa.base1, gene=gene_5prime, coordinate_type=coordinate_type, starting_assembly=rb, @@ -245,24 +224,24 @@ async def from_jaffa( tr_3prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(chrom2, rb), - seg_start_genomic=base2, + genomic_ac=self._get_genomic_ac(jaffa.chrom2, rb), + seg_start_genomic=jaffa.base2, gene=gene_3prime, coordinate_type=coordinate_type, starting_assembly=rb, ) - if rearrangement: + if jaffa.rearrangement: ce = CausativeEvent( eventType=EventType("rearrangement"), - eventDescription=classification, + eventDescription=jaffa.classification, ) else: ce = None read_data = ReadData( - split=SplitReads(splitReads=spanning_reads), - spanning=SpanningReads(spanningReads=spanning_pairs), + split=SplitReads(splitReads=jaffa.spanning_reads), + spanning=SpanningReads(spanningReads=jaffa.spanning_pairs), ) return self._format_fusion( @@ -271,7 +250,7 @@ async def from_jaffa( tr_5prime, tr_3prime, ce, 
- inframe, + jaffa.inframe, reads=read_data, ) diff --git a/tests/test_translators.py b/tests/test_translators.py index 0d4beb3..b06194a 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -4,6 +4,7 @@ import pytest from cool_seq_tool.schemas import Assembly, CoordinateType +from fusor.fusion_caller_models import JAFFA from fusor.models import ( AnchoredReads, AssayedFusion, @@ -148,28 +149,21 @@ async def test_jaffa( ): """Test JAFFA translator""" # Test exonic breakpoint - fusion_genes = "TPM3:PDGFRB" - chrom1 = "chr1" - base1 = 154170465 - chrom2 = "chr5" - base2 = 150126612 - rearrangement = True - classification = "HighConfidence" - inframe = True - spanning_reads = 100 - spanning_pairs = 80 + jaffa = JAFFA( + fusion_genes="TPM3:PDGFRB", + chrom1="chr1", + base1=154170465, + chrom2="chr5", + base2=150126612, + rearrangement=True, + classification="HighConfidence", + inframe=True, + spanning_reads=100, + spanning_pairs=80, + ) jaffa_fusor = await translator_instance.from_jaffa( - fusion_genes, - chrom1, - base1, - chrom2, - base2, - rearrangement, - classification, - inframe, - spanning_reads, - spanning_pairs, + jaffa, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) @@ -182,28 +176,21 @@ async def test_jaffa( assert jaffa_fusor.readData == fusion_data_example.readData # Test non-exonic breakpoint - fusion_genes = "TPM3:PDGFRB" - chrom1 = "chr1" - base1 = 154173079 - chrom2 = "chr5" - base2 = 150127173 - rearrangement = True - classification = "HighConfidence" - inframe = True - spanning_reads = 100 - spanning_pairs = 80 + jaffa = JAFFA( + fusion_genes="TPM3:PDGFRB", + chrom1="chr1", + base1=154173079, + chrom2="chr5", + base2=150127173, + rearrangement=True, + classification="HighConfidence", + inframe=True, + spanning_reads=100, + spanning_pairs=80, + ) jaffa_fusor_nonexonic = await translator_instance.from_jaffa( - fusion_genes, - chrom1, - base1, - chrom2, - base2, - rearrangement, - classification, - inframe, - 
spanning_reads, - spanning_pairs, + jaffa, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) From 2a7fbf034225f4fe5c64d25b1db22d96722d38f9 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 16 Jan 2025 10:51:52 -0500 Subject: [PATCH 04/20] Add STAR-Fusion --- src/fusor/fusion_caller_models.py | 39 +++++++++++++++++++++-- src/fusor/translator.py | 52 +++++++------------------------ tests/test_translators.py | 51 +++++++++++++----------------- 3 files changed, 69 insertions(+), 73 deletions(-) diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index ab34e75..7430d5b 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -10,16 +10,24 @@ class BaseModelForbidExtra(BaseModel, extra="forbid"): """Base Pydantic model class with extra values forbidden.""" -class FusionCallerTypes(str, Enum): - """Define FusionCaller type values""" +class Caller(str, Enum): + """Define different supported callers""" JAFFA = "JAFFA" + STAR_FUSION = "STAR-Fusion" + FUSION_CATCHER = "FusionCatcher" + FUSION_MAP = "FusionMap" + ARRIBA = "Arriba" + CICERO = "CICERO" + MAPSPLICE = "MapSplice" + ENFUSION = "EnFusion" + GENIE = "GENIE" class JAFFA(BaseModel): """Define parameters for JAFFA model""" - type: Literal[FusionCallerTypes.JAFFA] = FusionCallerTypes.JAFFA + type: Literal[Caller.JAFFA] = Caller.JAFFA fusion_genes: str = Field( ..., description="A string containing the two fusion partners" ) @@ -52,3 +60,28 @@ class JAFFA(BaseModel): ..., description="The number of detected reads that align entirely on either side of the breakpoint", ) + + +class STARFusion(BaseModel): + """Define parameters for STAR-Fusion model""" + + type: Literal[Caller.STAR_FUSION] = Caller.STAR_FUSION + left_gene: str = Field(..., description="The gene indicated in the LeftGene column") + right_gene: str = Field( + ..., description="The gene indicated in the RightGene column" + ) + left_breakpoint: str = Field( + ..., description="The gene 
indicated in the LeftBreakpoint column" + ) + right_breakpoint: str = Field( + ..., description="The gene indicated in the RightBreakpoint column" + ) + annots: str = Field(..., description="The annotations associated with the fusion") + junction_read_count: int = Field( + ..., + description="The number of RNA-seq fragments that split the junction between the two transcript segments (from STAR-Fusion documentation)", + ) + spanning_frag_count: int = Field( + ..., + description="The number of RNA-seq fragments that encompass the fusion junction such that one read of the pair aligns to a different gene than the other paired-end read of that fragment (from STAR-Fusion documentation)", + ) diff --git a/src/fusor/translator.py b/src/fusor/translator.py index 1e1fe8a..c6770e3 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -3,12 +3,11 @@ """ import logging -from enum import Enum import polars as pl from cool_seq_tool.schemas import Assembly, CoordinateType -from fusor.fusion_caller_models import JAFFA +from fusor.fusion_caller_models import JAFFA, Caller, STARFusion from fusor.fusor import FUSOR from fusor.models import ( AnchoredReads, @@ -28,20 +27,6 @@ _logger = logging.getLogger(__name__) -class Caller(str, Enum): - """Define different supported callers""" - - JAFFA = "JAFFA" - STAR_FUSION = "STAR-Fusion" - FUSION_CATCHER = "FusionCatcher" - FUSION_MAP = "FusionMap" - ARRIBA = "Arriba" - CICERO = "CICERO" - MAPSPLICE = "MapSplice" - ENFUSION = "EnFusion" - GENIE = "GENIE" - - class Translator: """Class for translating outputs from different fusion detection algorithms to FUSOR AssayedFusion objects @@ -256,34 +241,19 @@ async def from_jaffa( async def from_star_fusion( self, - left_gene: str, - right_gene: str, - left_breakpoint: str, - right_breakpoint: str, - annots: str, - junction_read_count: int, - spanning_frag_count: int, + star_fusion: STARFusion, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion: """Parse STAR-Fusion 
output to create AssayedFusion object - :param left_gene: The gene indicated in the LeftGene column - :param right_gene: The gene indicated in the RightGene column - :param left_breakpoint: The gene indicated in the LeftBreakpoint column - :param right_breakpoint: The gene indicated in the RightBreakpoint column - :param annots: The annotations associated with the fusion - :param junction_read_count: The number of RNA-seq fragments that split the - junction between the two transcript segments (from STAR-Fusion documentation) - :param spanning_frag_count: The number of RNA-seq fragments that encompass the - fusion junction such that one read of the pair aligns to a different gene - than the other paired-end read of that fragment (from STAR-Fusion documentation) + :param star_fusion: A STARFusion caller object :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful """ - gene1 = left_gene.split("^")[0] - gene2 = right_gene.split("^")[0] + gene1 = star_fusion.left_gene.split("^")[0] + gene2 = star_fusion.right_gene.split("^")[0] gene_5prime_element = self._get_gene_element(gene1, Caller.STAR_FUSION) gene_3prime_element = self._get_gene_element(gene2, Caller.STAR_FUSION) gene_5prime = gene_5prime_element.gene.label @@ -292,8 +262,8 @@ async def from_star_fusion( if not self._are_fusion_partners_different(gene_5prime, gene_3prime): return None - five_prime = left_breakpoint.split(":") - three_prime = right_breakpoint.split(":") + five_prime = star_fusion.left_breakpoint.split(":") + three_prime = star_fusion.right_breakpoint.split(":") tr_5prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, @@ -313,10 +283,12 @@ async def from_star_fusion( starting_assembly=rb, ) - ce = self._get_causative_event(five_prime[0], three_prime[0], ",".join(annots)) + ce = self._get_causative_event( + five_prime[0], three_prime[0], 
",".join(star_fusion.annots) + ) read_data = ReadData( - split=SplitReads(splitReads=junction_read_count), - spanning=SpanningReads(spanningReads=spanning_frag_count), + split=SplitReads(splitReads=star_fusion.junction_read_count), + spanning=SpanningReads(spanningReads=star_fusion.spanning_frag_count), ) return self._format_fusion( diff --git a/tests/test_translators.py b/tests/test_translators.py index b06194a..1eddcd2 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -4,7 +4,7 @@ import pytest from cool_seq_tool.schemas import Assembly, CoordinateType -from fusor.fusion_caller_models import JAFFA +from fusor.fusion_caller_models import JAFFA, Caller, STARFusion from fusor.models import ( AnchoredReads, AssayedFusion, @@ -14,7 +14,6 @@ SpanningReads, SplitReads, ) -from fusor.translator import Caller @pytest.fixture(scope="module") @@ -209,22 +208,18 @@ async def test_star_fusion( ): """Test STAR-Fusion translator""" # Test exonic breakpoints - left_gene = "TPM3^ENSG00000143549.19" - right_gene = "PDGFRB^ENSG00000113721" - left_breakpoint = "chr1:154170465:-" - right_breakpoint = "chr5:150126612:-" - annots = '["INTERCHROMOSOMAL]' - junction_read_count = 100 - spanning_frag_count = 80 + star_fusion = STARFusion( + left_gene="TPM3^ENSG00000143549.19", + right_gene="PDGFRB^ENSG00000113721", + left_breakpoint="chr1:154170465:-", + right_breakpoint="chr5:150126612:-", + annots='["INTERCHROMOSOMAL]', + junction_read_count=100, + spanning_frag_count=80, + ) star_fusion_fusor = await translator_instance.from_star_fusion( - left_gene, - right_gene, - left_breakpoint, - right_breakpoint, - annots, - junction_read_count, - spanning_frag_count, + star_fusion, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) @@ -237,22 +232,18 @@ async def test_star_fusion( assert star_fusion_fusor.readData == fusion_data_example.readData # Test non-exonic breakpoints - left_gene = "TPM3^ENSG00000143549.19" - right_gene = "PDGFRB^ENSG00000113721" - 
left_breakpoint = "chr1:154173079:-" - right_breakpoint = "chr5:150127173:-" - annots = '["INTERCHROMOSOMAL]' - junction_read_count = 100 - spanning_frag_count = 80 + star_fusion = STARFusion( + left_gene="TPM3^ENSG00000143549.19", + right_gene="PDGFRB^ENSG00000113721", + left_breakpoint="chr1:154173079:-", + right_breakpoint="chr5:150127173:-", + annots='["INTERCHROMOSOMAL]', + junction_read_count=100, + spanning_frag_count=80, + ) star_fusion_fusor_nonexonic = await translator_instance.from_star_fusion( - left_gene, - right_gene, - left_breakpoint, - right_breakpoint, - annots, - junction_read_count, - spanning_frag_count, + star_fusion, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) From f52b36f805850efac89fc36e2c4b322408f007aa Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 16 Jan 2025 12:19:42 -0500 Subject: [PATCH 05/20] Store work --- src/fusor/fusion_caller_models.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index 7430d5b..b0bc7b2 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -85,3 +85,24 @@ class STARFusion(BaseModel): ..., description="The number of RNA-seq fragments that encompass the fusion junction such that one read of the pair aligns to a different gene than the other paired-end read of that fragment (from STAR-Fusion documentation)", ) + + +class FusionCatcher(BaseModel): + """Define parameters for FusionCatcher model""" + + type: Literal[Caller.FUSION_CATCHER] = Caller.FUSION_CATCHER + five_prime_partner: str = Field( + ..., description="Gene symbol for the 5' fusion partner" + ) + three_prime_partner: str = Field( + ..., description="Gene symbol for the 3' fusion partner" + ) + five_prime_fusion_point: str = Field( + ..., + description="Chromosomal position for the 5' end of the fusion junction. 
This coordinate is 1-based", + ) + three_prime_fusion_point: str + predicted_effect: str + spanning_unique_reads: int + spanning_reads: int + fusion_sequence: str From cf1dafa053881355c66444eab482d90e8da3b919 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 16 Jan 2025 14:03:38 -0500 Subject: [PATCH 06/20] Add pydantic classes for remaining translators --- src/fusor/fusion_caller_models.py | 140 ++++++++++- src/fusor/translator.py | 262 +++++++------------- tests/test_translators.py | 392 ++++++++---------------------- 3 files changed, 328 insertions(+), 466 deletions(-) diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index b0bc7b2..309b5c4 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -101,8 +101,138 @@ class FusionCatcher(BaseModel): ..., description="Chromosomal position for the 5' end of the fusion junction. This coordinate is 1-based", ) - three_prime_fusion_point: str - predicted_effect: str - spanning_unique_reads: int - spanning_reads: int - fusion_sequence: str + three_prime_fusion_point: str = Field( + ..., + description="Chromosomal position for the 3' end of the fusion junction. 
This coordinate is 1-based", + ) + predicted_effect: str = Field( + ..., + description="The predicted effect of the fusion event, created using annotation from the Ensembl database", + ) + spanning_unique_reads: int = Field( + ..., description="The number of unique reads that map on the fusion junction" + ) + spanning_reads: int = Field( + ..., description="The number of paired reads that support the fusion" + ) + fusion_sequence: str = Field( + ..., description="The inferred sequence around the fusion junction" + ) + + +class Arriba(BaseModel): + """Define parameters for Arriba model""" + + type: Literal[Caller.ARRIBA] = Caller.ARRIBA + gene1: str = Field(..., description="The 5' gene fusion partner") + gene2: str = Field(..., description="The 3' gene fusion partner") + strand1: str = Field( + ..., description="The strand information for the 5' gene fusion partner" + ) + strand2: str = Field( + ..., description="The strand information for the 3' gene fusion partner" + ) + breakpoint1: str = Field(..., description="The chromosome and breakpoint for gene1") + breakpoint2: str = Field(..., description="The chromosome and breakpoint for gene2") + event: str = Field(..., description=" An inference about the type of fusion event") + confidence: str = Field( + ..., description="A metric describing the confidence of the fusion prediction" + ) + direction1: str = Field( + ..., + description="A description that indicates if the transcript segment starts or ends at breakpoint1", + ) + direction2: str = Field( + ..., + description="A description that indicates if the transcript segment starts or ends at breakpoint2", + ) + rf: str = Field( + ..., + description="A description if the reading frame is preserved for the fusion", + ) + split_reads1: int = Field( + ..., description="Number of supporting split fragments with anchor in gene1" + ) + split_reads2: int = Field( + ..., description="Number of supporting split fragments with anchor in gene2" + ) + discordant_mates: int = 
Field( + ..., description="Number of discordant mates supporting the fusion" + ) + coverage1: int = Field( + ..., description="Number of fragments retained near breakpoint1" + ) + coverage2: int = Field( + ..., description="Number of fragments retained near breakpoint2" + ) + fusion_transcript: str = Field(..., description="The assembled fusion transcript") + + +class Cicero(BaseModel): + """Define parameters for CICERO model""" + + type: Literal[Caller.CICERO] = Caller.CICERO + gene_5prime: str = Field(..., description="The gene symbol for the 5' partner") + gene_3prime: str = Field(..., description="The gene symbol for the 3' partner") + chr_5prime: str = Field(..., description="The chromosome for the 5' partner") + chr_3prime: str = Field(..., description="The chromosome for the 3' partner") + pos_5prime: int = Field( + ..., description="The genomic breakpoint for the 5' partner" + ) + pos_3prime: int = Field( + ..., description="The genomic breakpoint for the 3' partner" + ) + sv_ort: str = Field( + ..., + description="Whether the mapping orientation of assembled contig (driven by structural variation) has confident biological meaning", + ) + event_type: str = Field( + ..., description="The structural variation event that created the called fusion" + ) + reads_5prime: int = Field( + ..., + description="The number of reads that support the breakpoint for the 5' partner", + ) + reads_3prime: int = Field( + ..., + description="The number of reads that support the breakpoint for the 3' partner", + ) + coverage_5prime: int = Field( + ..., description="The fragment coverage at the 5' breakpoint" + ) + coverage_3prime: int = Field( + ..., description="The fragment coverage at the 3' breakpoint" + ) + contig: str = Field(..., description="The assembled contig sequence for the fusion") + + +class EnFusion(BaseModel): + """Define parameters for EnFusion model""" + + type: Literal[Caller.ENFUSION] = Caller.ENFUSION + gene_5prime: str = Field(..., description="The 5' gene 
fusion partner") + gene_3prime: str = Field(..., description="The 3' gene fusion partner") + chr_5prime: int = Field(..., description="The 5' gene fusion partner chromosome") + chr_3prime: int = Field(..., description="The 3' gene fusion partner chromosome") + break_5prime: int = Field( + ..., description="The 5' gene fusion partner genomic breakpoint" + ) + break_3prime: int = Field( + ..., description="The 3' gene fusion partner genomic breakpoint" + ) + + +class Genie(BaseModel): + """Define parameters for Genie model""" + + type: Literal[Caller.GENIE] = Caller.GENIE + site1_hugo: str = Field(..., description="The HUGO symbol reported at site 1") + site2_hugo: str = Field(..., description="The HUGO symbol reported at site 2") + site1_chrom: int = Field(..., description="The chromosome reported at site 1") + site2_chrom: int = Field(..., description="The chromosome reported at site 2") + site1_pos: int = Field(..., description="The breakpoint reported at site 1") + site2_pos: int = Field(..., description="The breakpoint reported at site 2") + annot: str = Field(..., description="The annotation for the fusion event") + reading_frame: str = Field( + ..., description="The reading frame status of the fusion" + ) diff --git a/src/fusor/translator.py b/src/fusor/translator.py index c6770e3..86998cb 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -7,7 +7,16 @@ import polars as pl from cool_seq_tool.schemas import Assembly, CoordinateType -from fusor.fusion_caller_models import JAFFA, Caller, STARFusion +from fusor.fusion_caller_models import ( + JAFFA, + Arriba, + Caller, + Cicero, + EnFusion, + FusionCatcher, + Genie, + STARFusion, +) from fusor.fusor import FUSOR from fusor.models import ( AnchoredReads, @@ -302,48 +311,30 @@ async def from_star_fusion( async def from_fusion_catcher( self, - five_prime_partner: str, - three_prime_partner: str, - five_prime_fusion_point: str, - three_prime_fusion_point: str, - predicted_effect: str, - 
spanning_unique_reads: int, - spanning_reads: int, - fusion_sequence: str, + fusion_catcher: FusionCatcher, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion: """Parse FusionCatcher output to create AssayedFusion object - :param five_prime_partner: Gene symbol for the 5' fusion partner - :param three_prime_partner: Gene symbol for the 3' fusion partner - :param five_prime_fusion_point: Chromosomal position for the 5' end of the - fusion junction. This coordinate is 1-based - :param three_prime_fusion_point: Chromosomal position for the 3' end of the - fusion junction. This coordinate is 1-based - :param predicted_effect: The predicted effect of the fusion event, created - using annotation from the Ensembl database - :param spanning_unique_reads: The number of unique reads that map on the fusion - junction - :param spanning_reads: The number of paired reads that support the fusion - :param fusion_sequence: The inferred sequence around the fusion junction + :param fusion_catcher: A FusionCatcher object :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful """ gene_5prime_element = self._get_gene_element( - five_prime_partner, Caller.FUSION_CATCHER + fusion_catcher.five_prime_partner, Caller.FUSION_CATCHER ) gene_3prime_element = self._get_gene_element( - three_prime_partner, Caller.FUSION_CATCHER + fusion_catcher.three_prime_partner, Caller.FUSION_CATCHER ) if not self._are_fusion_partners_different( gene_5prime_element.gene.label, gene_3prime_element.gene.label ): return None - five_prime = five_prime_fusion_point.split(":") - three_prime = three_prime_fusion_point.split(":") + five_prime = fusion_catcher.five_prime_fusion_point.split(":") + three_prime = fusion_catcher.three_prime_fusion_point.split(":") tr_5prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, @@ -363,12 +354,14 @@ async 
def from_fusion_catcher( starting_assembly=rb, ) - ce = self._get_causative_event(five_prime[0], three_prime[0], predicted_effect) + ce = self._get_causative_event( + five_prime[0], three_prime[0], fusion_catcher.predicted_effect + ) read_data = ReadData( - split=SplitReads(splitReads=spanning_unique_reads), - spanning=SpanningReads(spanningReads=spanning_reads), + split=SplitReads(splitReads=fusion_catcher.spanning_unique_reads), + spanning=SpanningReads(spanningReads=fusion_catcher.spanning_reads), ) - contig = ContigSequence(contig=fusion_sequence) + contig = ContigSequence(contig=fusion_catcher.fusion_sequence) return self._format_fusion( gene_5prime_element, @@ -440,47 +433,13 @@ async def from_fusion_map( async def from_arriba( self, - gene1: str, - gene2: str, - strand1: str, - strand2: str, - breakpoint1: str, - breakpoint2: str, - event: str, - confidence: str, - direction1: str, - direction2: str, - rf: str, - split_reads1: int, - split_reads2: int, - discordant_mates: int, - coverage1: int, - coverage2: int, - fusion_transcript: str, + arriba: Arriba, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion: """Parse Arriba output to create AssayedFusion object - :param gene1: The 5' gene fusion partner - :param gene2: The 3' gene fusion partner - :param strand1: The strand information for the 5' gene fusion partner - :param strand2: The strand information for the 3' gene fusion partner - :param breakpoint1: The chromosome and breakpoint for gene1 - :param breakpoint2: The chromosome and breakpoint for gene2 - :param event: An inference about the type of fusion event - :param confidence: A metric describing the confidence of the fusion prediction - :param direction1: A description that indicates if the transcript segment - starts or ends at breakpoint1 - :param direction2: A description that indicates if the transcript segment - starts or ends at breakpoint2 - :param rf: A description if the reading frame is preserved for the fusion - :param 
split_reads1: Number of supporting split fragments with anchor in gene1 - :param split_reads2: Number of supporting split fragments with anchor in gene2 - :param discordant_mates: Number of discordant mates supporting the fusion - :param coverage1: Number of fragments retained near breakpoint1 - :param coverage2: Number of fragments retained near breakpoint2 - :param fusion_transcript: The assembled fusion transcript + :param arriba: An Arriba class instance :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful @@ -488,27 +447,27 @@ async def from_arriba( # Arriba reports two gene symbols if a breakpoint occurs in an intergenic # space. We select the gene symbol with the smallest distance from the # breakpoint. - gene_5prime_element = self._get_gene_element(gene1, "arriba") - gene_3prime_element = self._get_gene_element(gene2, "arriba") + gene_5prime_element = self._get_gene_element(arriba.gene1, "arriba") + gene_3prime_element = self._get_gene_element(arriba.gene2, "arriba") gene_5prime = gene_5prime_element.gene.label gene_3prime = gene_3prime_element.gene.label if not self._are_fusion_partners_different(gene_5prime, gene_3prime): return None - strand1 = strand1.split("/")[1] # Determine strand that is transcribed - strand2 = strand2.split("/")[1] # Determine strand that is transcribed + strand1 = arriba.strand1.split("/")[1] # Determine strand that is transcribed + strand2 = arriba.strand2.split("/")[1] # Determine strand that is transcribed if strand1 == "+": - gene1_seg_start = direction1 == "upstream" + gene1_seg_start = arriba.direction1 == "upstream" else: - gene1_seg_start = direction1 == "downstream" + gene1_seg_start = arriba.direction1 == "downstream" if strand2 == "+": - gene2_seg_start = direction2 == "upstream" + gene2_seg_start = arriba.direction2 == "upstream" else: - gene2_seg_start = direction2 == "downstream" + 
gene2_seg_start = arriba.direction2 == "downstream" - breakpoint1 = breakpoint1.split(":") - breakpoint2 = breakpoint2.split(":") + breakpoint1 = arriba.breakpoint1.split(":") + breakpoint2 = arriba.breakpoint2.split(":") tr_5prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, @@ -516,8 +475,8 @@ async def from_arriba( seg_start_genomic=int(breakpoint1[1]) if gene1_seg_start else None, seg_end_genomic=int(breakpoint1[1]) if not gene1_seg_start else None, gene=gene_5prime, - coverage=BreakpointCoverage(fragmentCoverage=coverage1), - reads=AnchoredReads(reads=split_reads1), + coverage=BreakpointCoverage(fragmentCoverage=arriba.coverage1), + reads=AnchoredReads(reads=arriba.split_reads1), coordinate_type=coordinate_type, starting_assembly=rb, ) @@ -528,8 +487,8 @@ async def from_arriba( seg_start_genomic=int(breakpoint2[1]) if gene2_seg_start else None, seg_end_genomic=int(breakpoint2[1]) if not gene2_seg_start else None, gene=gene_3prime, - coverage=BreakpointCoverage(fragmentCoverage=coverage2), - reads=AnchoredReads(reads=split_reads2), + coverage=BreakpointCoverage(fragmentCoverage=arriba.coverage2), + reads=AnchoredReads(reads=arriba.split_reads2), coordinate_type=coordinate_type, starting_assembly=rb, ) @@ -537,17 +496,19 @@ async def from_arriba( ce = ( CausativeEvent( eventType=EventType("read-through"), - eventDescription=confidence, + eventDescription=arriba.confidence, ) - if "read_through" in event + if "read_through" in arriba.event else CausativeEvent( eventType=EventType("rearrangement"), - eventDescription=confidence, + eventDescription=arriba.confidence, ) ) - rf = bool(rf == "in-frame") if rf != "." else None - read_data = ReadData(spanning=SpanningReads(spanningReads=discordant_mates)) - contig = ContigSequence(contig=fusion_transcript) + rf = bool(arriba.rf == "in-frame") if arriba.rf != "." 
else None + read_data = ReadData( + spanning=SpanningReads(spanningReads=arriba.discordant_mates) + ) + contig = ContigSequence(contig=arriba.fusion_transcript) return self._format_fusion( gene_5prime_element, @@ -562,38 +523,13 @@ async def from_arriba( async def from_cicero( self, - gene_5prime: str, - gene_3prime: str, - chr_5prime: str, - chr_3prime: str, - pos_5prime: int, - pos_3prime: int, - sv_ort: str, - event_type: str, - reads_a: int, - reads_b: int, - coverage_a: int, - coverage_b: int, - contig: str, + cicero: Cicero, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion | str: """Parse CICERO output to create AssayedFusion object - :param gene_5prime: The gene symbol for the 5' partner - :param gene_3prime: The gene symbol for the 3' partner - :param chr_5prime: The chromosome for the 5' partner - :param chr_3prime: The chromosome for the 3' partner - :param pos_5prime: The genomic breakpoint for the 5' partner - :param pos_3prime: The genomic breakpoint for the 3' partner - :param sv_ort: Whether the mapping orientation of assembled contig (driven by - structural variation) has confident biological meaning - :param event_type: The structural variation event that created the called fusion - :param readsA: The number of reads that support the breakpoint for the 5' partner - :param readsB: The number of reads that support the breakpoint for the 3' partner - :param coverageA: The fragment coverage at the 5' breakpoint - :param coverageB: The fragment coverage at the 3' breakpoint - :param contig: The assembled contig sequence for the fusion + :param cicero: A Cicero class instance :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful @@ -602,20 +538,20 @@ async def from_cicero( # gene symbols for `gene_5prime` or `gene_3prime`, which are separated by a comma. 
As # there is not a precise way to resolve this ambiguity, we do not process # these events - if "," in gene_5prime or "," in gene_3prime: + if "," in cicero.gene_5prime or "," in cicero.gene_3prime: msg = "Ambiguous gene symbols are reported by CICERO for at least one of the fusion partners" _logger.warning(msg) return msg # Check CICERO annotation regarding the confidence that the called fusion # has biological meaning - if sv_ort != ">": + if cicero.sv_ort != ">": msg = "CICERO annotation indicates that this event does not have confident biological meaning" _logger.warning(msg) return msg - gene_5prime_element = self._get_gene_element(gene_5prime, "cicero") - gene_3prime_element = self._get_gene_element(gene_3prime, "cicero") + gene_5prime_element = self._get_gene_element(cicero.gene_5prime, "cicero") + gene_3prime_element = self._get_gene_element(cicero.gene_3prime, "cicero") gene_5prime = gene_5prime_element.gene.label gene_3prime = gene_3prime_element.gene.label @@ -624,37 +560,37 @@ async def from_cicero( tr_5prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(chr_5prime, rb), - seg_end_genomic=pos_5prime, + genomic_ac=self._get_genomic_ac(cicero.chr_5prime, rb), + seg_end_genomic=cicero.pos_5prime, gene=gene_5prime, - coverage=BreakpointCoverage(fragmentCoverage=coverage_a), - reads=AnchoredReads(reads=reads_a), + coverage=BreakpointCoverage(fragmentCoverage=cicero.coverage_5prime), + reads=AnchoredReads(reads=cicero.reads_5prime), coordinate_type=coordinate_type, starting_assembly=rb, ) tr_3prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(chr_3prime, rb), - seg_start_genomic=pos_3prime, + genomic_ac=self._get_genomic_ac(cicero.chr_3prime, rb), + seg_start_genomic=cicero.pos_3prime, gene=gene_3prime, - coverage=BreakpointCoverage(fragmentCoverage=coverage_b), - reads=AnchoredReads(reads=reads_b), + 
coverage=BreakpointCoverage(fragmentCoverage=cicero.coverage_3prime), + reads=AnchoredReads(reads=cicero.reads_3prime), coordinate_type=coordinate_type, starting_assembly=rb, ) - if event_type == "read_through": + if cicero.event_type == "read_through": ce = CausativeEvent( eventType=EventType("read-through"), - eventDescription=event_type, + eventDescription=cicero.event_type, ) else: ce = CausativeEvent( eventType=EventType("rearrangement"), - eventDescription=event_type, + eventDescription=cicero.event_type, ) - contig = ContigSequence(contig=contig) + contig = ContigSequence(contig=cicero.contig) return self._format_fusion( gene_5prime_element, @@ -710,29 +646,19 @@ async def from_mapsplice( async def from_enfusion( self, - gene_5prime: str, - gene_3prime: str, - chr_5prime: int, - chr_3prime: int, - break_5prime: int, - break_3prime: int, + enfusion: EnFusion, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion: """Parse EnFusion output to create AssayedFusion object - :param gene_5prime: The 5' gene fusion partner - :param gene_3prime: The 3' gene fusion partner - :param chr_5prime: The 5' gene fusion partner chromosome - :param chr_3prime: The 3' gene fusion partner chromosome - :param break_5prime: The 5' gene fusion partner genomic breakpoint - :param break_3prime: The 3' gene fusion partner genomic breakpoint - :param rb: The reference build used to call the fusion + :param enfusion: An Enfusion class instance :param coordinate_type: If the coordinate is inter-residue or residue + :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful """ - gene_5prime_element = self._get_gene_element(gene_5prime, "enfusion") - gene_3prime_element = self._get_gene_element(gene_3prime, "enfusion") + gene_5prime_element = self._get_gene_element(enfusion.gene_5prime, "enfusion") + gene_3prime_element = self._get_gene_element(enfusion.gene_3prime, "enfusion") gene_5prime = 
gene_5prime_element.gene.label gene_3prime = gene_3prime_element.gene.label @@ -741,8 +667,8 @@ async def from_enfusion( tr_5prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(chr_5prime, rb), - seg_end_genomic=break_5prime, + genomic_ac=self._get_genomic_ac(enfusion.chr_5prime, rb), + seg_end_genomic=enfusion.break_5prime, gene=gene_5prime, coordinate_type=coordinate_type, starting_assembly=rb, @@ -750,16 +676,16 @@ async def from_enfusion( tr_3prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(chr_3prime, rb), - seg_start_genomic=break_3prime, + genomic_ac=self._get_genomic_ac(enfusion.chr_3prime, rb), + seg_start_genomic=enfusion.break_3prime, gene=gene_3prime, coordinate_type=coordinate_type, starting_assembly=rb, ) ce = self._get_causative_event( - chr_5prime, - chr_3prime, + enfusion.chr_5prime, + enfusion.chr_3prime, ) return self._format_fusion( gene_5prime_element, gene_3prime_element, tr_5prime, tr_3prime, ce @@ -767,33 +693,19 @@ async def from_enfusion( async def from_genie( self, - site1_hugo: str, - site2_hugo: str, - site1_chrom: int, - site2_chrom: int, - site1_pos: int, - site2_pos: int, - annot: str, - reading_frame: str, + genie: Genie, coordinate_type: CoordinateType, rb: Assembly, ) -> AssayedFusion: """Parse GENIE output to create AssayedFusion object - :param site1_hugo: The HUGO symbol reported at site 1 - :param site2_hugo: The HUGO symbol reported at site 2 - :param site1_chrom: The chromosome reported at site 1 - :param site2_chrom: The chromosome reported at site 2 - :param site1_pos: The breakpoint reported at site 1 - :param site2_pos: The breakpoint reported at site 2 - :param annot: The annotation for the fusion event - :param reading_frame: The reading frame status of the fusion + :param genie: A Genie class instance :param coordinate_type: If the coordinate is inter-residue or residue :param rb: The 
reference build used to call the fusion :return: An AssayedFusion object, if construction is successful """ - gene_5prime_element = self._get_gene_element(site1_hugo, "genie") - gene_3prime_element = self._get_gene_element(site2_hugo, "genie") + gene_5prime_element = self._get_gene_element(genie.site1_hugo, "genie") + gene_3prime_element = self._get_gene_element(genie.site2_hugo, "genie") gene_5prime = gene_5prime_element.gene.label gene_3prime = gene_3prime_element.gene.label @@ -802,8 +714,8 @@ async def from_genie( tr_5prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(site1_chrom, rb), - seg_end_genomic=site1_pos, + genomic_ac=self._get_genomic_ac(genie.site1_chrom, rb), + seg_end_genomic=genie.site1_pos, gene=gene_5prime, coordinate_type=coordinate_type, starting_assembly=rb, @@ -811,19 +723,19 @@ async def from_genie( tr_3prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(site2_chrom, rb), - seg_start_genomic=site2_pos, + genomic_ac=self._get_genomic_ac(genie.site2_chrom, rb), + seg_start_genomic=genie.site2_pos, gene=gene_3prime, coordinate_type=coordinate_type, starting_assembly=rb, ) ce = self._get_causative_event( - site1_chrom, - site2_chrom, - annot, + genie.site1_chrom, + genie.site2_chrom, + genie.annot, ) - rf = bool(reading_frame == "in frame") + rf = bool(genie.reading_frame == "in frame") return self._format_fusion( gene_5prime, gene_3prime, tr_5prime, tr_3prime, ce, rf ) diff --git a/tests/test_translators.py b/tests/test_translators.py index 1eddcd2..6811a5d 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -4,7 +4,16 @@ import pytest from cool_seq_tool.schemas import Assembly, CoordinateType -from fusor.fusion_caller_models import JAFFA, Caller, STARFusion +from fusor.fusion_caller_models import ( + JAFFA, + Arriba, + Caller, + Cicero, + EnFusion, + FusionCatcher, + Genie, + STARFusion, +) from 
fusor.models import ( AnchoredReads, AssayedFusion, @@ -175,18 +184,8 @@ async def test_jaffa( assert jaffa_fusor.readData == fusion_data_example.readData # Test non-exonic breakpoint - jaffa = JAFFA( - fusion_genes="TPM3:PDGFRB", - chrom1="chr1", - base1=154173079, - chrom2="chr5", - base2=150127173, - rearrangement=True, - classification="HighConfidence", - inframe=True, - spanning_reads=100, - spanning_pairs=80, - ) + jaffa.base1 = 154173079 + jaffa.base2 = 150127173 jaffa_fusor_nonexonic = await translator_instance.from_jaffa( jaffa, @@ -232,15 +231,8 @@ async def test_star_fusion( assert star_fusion_fusor.readData == fusion_data_example.readData # Test non-exonic breakpoints - star_fusion = STARFusion( - left_gene="TPM3^ENSG00000143549.19", - right_gene="PDGFRB^ENSG00000113721", - left_breakpoint="chr1:154173079:-", - right_breakpoint="chr5:150127173:-", - annots='["INTERCHROMOSOMAL]', - junction_read_count=100, - spanning_frag_count=80, - ) + star_fusion.left_breakpoint = "chr1:154173079:-" + star_fusion.right_breakpoint = "chr5:150127173:-" star_fusion_fusor_nonexonic = await translator_instance.from_star_fusion( star_fusion, @@ -266,24 +258,19 @@ async def test_fusion_catcher( ): """Test Fusion Catcher translator""" # Test exonic breakpoint - five_prime_partner = "TPM3" - three_prime_partner = "PDGFRB" - five_prime_fusion_point = "1:154170465:-" - three_prime_fusion_point = "5:150126612:-" - predicted_effect = "exonic(no-known-CDS)/exonic(no-known-CDS)" - spanning_unique_reads = 100 - spanning_reads = 80 - fusion_sequence = "CTAGATGAC*TACTACTA" + fusion_catcher = FusionCatcher( + five_prime_partner="TPM3", + three_prime_partner="PDGFRB", + five_prime_fusion_point="1:154170465:-", + three_prime_fusion_point="5:150126612:-", + predicted_effect="exonic(no-known-CDS)/exonic(no-known-CDS)", + spanning_unique_reads=100, + spanning_reads=80, + fusion_sequence="CTAGATGAC*TACTACTA", + ) fusion_catcher_fusor = await translator_instance.from_fusion_catcher( - 
five_prime_partner, - three_prime_partner, - five_prime_fusion_point, - three_prime_fusion_point, - predicted_effect, - spanning_unique_reads, - spanning_reads, - fusion_sequence, + fusion_catcher, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) @@ -297,25 +284,11 @@ async def test_fusion_catcher( assert fusion_catcher_fusor.readData == fusion_data_example.readData assert fusion_catcher_fusor.contig == fusion_catcher_fusor.contig - # Test non-exonic breakpoint - five_prime_partner = "TPM3" - three_prime_partner = "PDGFRB" - five_prime_fusion_point = "1:154173079:-" - three_prime_fusion_point = "5:150127173:-" - predicted_effect = "exonic(no-known-CDS)/exonic(no-known-CDS)" - spanning_unique_reads = 100 - spanning_reads = 80 - fusion_sequence = "CTAGATGAC*TACTACTA" + fusion_catcher.five_prime_fusion_point = "1:154173079:-" + fusion_catcher.three_prime_fusion_point = "5:150127173:-" fusion_catcher_fusor_nonexonic = await translator_instance.from_fusion_catcher( - five_prime_partner, - three_prime_partner, - five_prime_fusion_point, - three_prime_fusion_point, - predicted_effect, - spanning_unique_reads, - spanning_reads, - fusion_sequence, + fusion_catcher, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) @@ -389,48 +362,34 @@ async def test_arriba( ): """Test Arriba translator""" # Test exonic breakpoint - gene1 = "TPM3" - gene2 = "PDGFRB" - strand1 = "-/-" - strand2 = "-/-" - breakpoint1 = "1:154170465" - breakpoint2 = "5:150126612" - event = "translocation" - confidence = "high" - direction1 = "upstream" - direction2 = "downstream" - rf = "in-frame" - split_reads1 = 100 - split_reads2 = 95 - discordant_mates = 30 - coverage1 = 200 - coverage2 = 190 - fusion_transcript = "CTAGATGAC_TACTACTA|GTACTACT" + arriba = Arriba( + gene1="TPM3", + gene2="PDGFRB", + strand1="-/-", + strand2="-/-", + breakpoint1="1:154170465", + breakpoint2="5:150126612", + event="translocation", + confidence="high", + direction1="upstream", + direction2="downstream", + 
rf="in-frame", + split_reads1=100, + split_reads2=95, + discordant_mates=30, + coverage1=200, + coverage2=190, + fusion_transcript="CTAGATGAC_TACTACTA|GTACTACT", + ) arriba_fusor = await translator_instance.from_arriba( - gene1, - gene2, - strand1, - strand2, - breakpoint1, - breakpoint2, - event, - confidence, - direction1, - direction2, - rf, - split_reads1, - split_reads2, - discordant_mates, - coverage1, - coverage2, - fusion_transcript, + arriba, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) fusion_data_example = fusion_data_example( readData=ReadData(spanning=SpanningReads(spanningReads=30)), - contig=ContigSequence(contig=fusion_transcript), + contig=ContigSequence(contig=arriba.fusion_transcript), ) fusion_data_example.structure[0].coverage = BreakpointCoverage(fragmentCoverage=200) fusion_data_example.structure[0].anchoredReads = AnchoredReads(reads=100) @@ -441,47 +400,17 @@ async def test_arriba( assert arriba_fusor.contig == fusion_data_example.contig # Test non-exonic breakpoint - gene1 = "TPM3" - gene2 = "PDGFRB" - strand1 = "-/-" - strand2 = "-/-" - breakpoint1 = "1:154173079" - breakpoint2 = "5:150127173" - event = "translocation" - confidence = "high" - direction1 = "upstream" - direction2 = "downstream" - rf = "in-frame" - split_reads1 = 100 - split_reads2 = 95 - discordant_mates = 30 - coverage1 = 200 - coverage2 = 190 + arriba.breakpoint1 = "1:154173079" + arriba.breakpoint2 = "5:150127173" arriba_fusor_nonexonic = await translator_instance.from_arriba( - gene1, - gene2, - strand1, - strand2, - breakpoint1, - breakpoint2, - event, - confidence, - direction1, - direction2, - rf, - split_reads1, - split_reads2, - discordant_mates, - coverage1, - coverage2, - fusion_transcript, + arriba, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) fusion_data_example_nonexonic = fusion_data_example_nonexonic( readData=ReadData(spanning=SpanningReads(spanningReads=30)), - contig=ContigSequence(contig=fusion_transcript), + 
contig=ContigSequence(contig=arriba.fusion_transcript), ) fusion_data_example_nonexonic.structure[0].coverage = BreakpointCoverage( fragmentCoverage=200 @@ -502,38 +431,30 @@ async def test_cicero( ): """Test CICERO translator""" # Test exonic breakpoint - gene_5prime = "TPM3" - gene_3prime = "PDGFRB" - chr_5prime = "1" - chr_3prime = "5" - pos_5prime = 154170465 - pos_3prime = 150126612 - sv_ort = ">" - event_type = "CTX" - reads_a = 100 - reads_b = 90 - coverage_a = 200 - coverage_b = 190 - contig = "ATCATACTAGATACTACTACGATGAGAGAGTACATAGAT" + cicero = Cicero( + gene_5prime="TPM3", + gene_3prime="PDGFRB", + chr_5prime="1", + chr_3prime="5", + pos_5prime=154170465, + pos_3prime=150126612, + sv_ort=">", + event_type="CTX", + reads_5prime=100, + reads_3prime=90, + coverage_5prime=200, + coverage_3prime=190, + contig="ATCATACTAGATACTACTACGATGAGAGAGTACATAGAT", + ) cicero_fusor = await translator_instance.from_cicero( - gene_5prime, - gene_3prime, - chr_5prime, - chr_3prime, - pos_5prime, - pos_3prime, - sv_ort, - event_type, - reads_a, - reads_b, - coverage_a, - coverage_b, - contig, + cicero, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) - fusion_data_example = fusion_data_example(contig=ContigSequence(contig=contig)) + fusion_data_example = fusion_data_example( + contig=ContigSequence(contig=cicero.contig) + ) fusion_data_example.structure[0].coverage = BreakpointCoverage(fragmentCoverage=200) fusion_data_example.structure[0].anchoredReads = AnchoredReads(reads=100) fusion_data_example.structure[1].coverage = BreakpointCoverage(fragmentCoverage=190) @@ -543,39 +464,16 @@ async def test_cicero( assert cicero_fusor.contig == fusion_data_example.contig # Test non-exonic breakpoint - gene_5prime = "TPM3" - gene_3prime = "PDGFRB" - chr_5prime = "1" - chr_3prime = "5" - pos_5prime = 154173079 - pos_3prime = 150127173 - sv_ort = ">" - event_type = "CTX" - reads_a = 100 - reads_b = 90 - coverage_a = 200 - coverage_b = 190 - contig = 
"ATCATACTAGATACTACTACGATGAGAGAGTACATAGAT" + cicero.pos_5prime = 154173079 + cicero.pos_3prime = 150127173 cicero_fusor_nonexonic = await translator_instance.from_cicero( - gene_5prime, - gene_3prime, - chr_5prime, - chr_3prime, - pos_5prime, - pos_3prime, - sv_ort, - event_type, - reads_a, - reads_b, - coverage_a, - coverage_b, - contig, + cicero, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) fusion_data_example_nonexonic = fusion_data_example_nonexonic( - contig=ContigSequence(contig=contig) + contig=ContigSequence(contig=cicero.contig) ) fusion_data_example_nonexonic.structure[0].coverage = BreakpointCoverage( fragmentCoverage=200 @@ -590,34 +488,10 @@ async def test_cicero( assert cicero_fusor_nonexonic.contig == fusion_data_example_nonexonic.contig # Test case where the called fusion does not have confident biological meaning - gene_5prime = "TPM3" - gene_3prime = "PDGFRB" - chr_5prime = "1" - chr_3prime = "5" - pos_5prime = 154173079 - pos_3prime = 150127173 - sv_ort = "?" - event_type = "CTX" - reads_a = 100 - reads_b = 90 - coverage_a = 200 - coverage_b = 190 - contig = "ATCATACTAGATACTACTACGATGAGAGAGTACATAGAT" + cicero.sv_ort = "?" non_confident_bio = await translator_instance.from_cicero( - gene_5prime, - gene_3prime, - chr_5prime, - chr_3prime, - pos_5prime, - pos_3prime, - sv_ort, - event_type, - reads_a, - reads_b, - coverage_a, - coverage_b, - contig, + cicero, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) @@ -627,34 +501,10 @@ async def test_cicero( ) # Test case where multiple gene symbols are reported for a fusion partner - gene_5prime = "TPM3" - gene_3prime = "PDGFRB,PDGFRB-FGFR4,FGFR4" - chr_5prime = "1" - chr_3prime = "5" - pos_5prime = 154173079 - pos_3prime = 150127173 - sv_ort = "?" 
- event_type = "CTX" - reads_a = 100 - reads_b = 90 - coverage_a = 200 - coverage_b = 190 - contig = "ATCATACTAGATACTACTACGATGAGAGAGTACATAGAT" + cicero.gene_3prime = "PDGFRB,PDGFRB-FGFR4,FGFR4" multiple_genes_fusion_partner = await translator_instance.from_cicero( - gene_5prime, - gene_3prime, - chr_5prime, - chr_3prime, - pos_5prime, - pos_3prime, - sv_ort, - event_type, - reads_a, - reads_b, - coverage_a, - coverage_b, - contig, + cicero, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) @@ -670,40 +520,28 @@ async def test_enfusion( ): """Test EnFusion translator""" # Test exonic breakpoint - gene_5prime = "TPM3" - gene_3prime = "PDGFRB" - chr_5prime = 1 - chr_3prime = 5 - break_5prime = 154170465 - break_3prime = 150126612 + enfusion = EnFusion( + gene_5prime="TPM3", + gene_3prime="PDGFRB", + chr_5prime=1, + chr_3prime=5, + break_5prime=154170465, + break_3prime=150126612, + ) enfusion_fusor = await translator_instance.from_enfusion( - gene_5prime, - gene_3prime, - chr_5prime, - chr_3prime, - break_5prime, - break_3prime, + enfusion, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) assert enfusion_fusor.structure == fusion_data_example().structure # Test non-exonic breakpoint - gene_5prime = "TPM3" - gene_3prime = "PDGFRB" - chr_5prime = 1 - chr_3prime = 5 - break_5prime = 154173079 - break_3prime = 150127173 + enfusion.break_5prime = 154173079 + enfusion.break_3prime = 150127173 enfusion_fusor_nonexonic = await translator_instance.from_enfusion( - gene_5prime, - gene_3prime, - chr_5prime, - chr_3prime, - break_5prime, - break_3prime, + enfusion, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) @@ -718,48 +556,30 @@ async def test_genie( ): """Test GENIE Translator""" # Test exonic breakpoint - site1_hugo = "TPM3" - site2_hugo = "PDGFRB" - site1_chrom = 1 - site2_chrom = 5 - site1_pos = 154170465 - site2_pos = 150126612 - annot = "TMP3 (NM_152263.4) - PDGFRB (NM_002609.4) fusion" - reading_frame = "In_frame" + genie = Genie( + 
site1_hugo="TPM3", + site2_hugo="PDGFRB", + site1_chrom=1, + site2_chrom=5, + site1_pos=154170465, + site2_pos=150126612, + annot="TMP3 (NM_152263.4) - PDGFRB (NM_002609.4) fusion", + reading_frame="In_frame", + ) genie_fusor = await translator_instance.from_genie( - site1_hugo, - site2_hugo, - site1_chrom, - site2_chrom, - site1_pos, - site2_pos, - annot, - reading_frame, + genie, CoordinateType.INTER_RESIDUE.value, Assembly.GRCH38.value, ) assert genie_fusor.structure == fusion_data_example().structure # Test non-exonic breakpoint - site1_hugo = "TPM3" - site2_hugo = "PDGFRB" - site1_chrom = 1 - site2_chrom = 5 - site1_pos = 154173079 - site2_pos = 150127173 - annot = "TMP3 (NM_152263.4) - PDGFRB (NM_002609.4) fusion" - reading_frame = "In_frame" + genie.site1_pos = 154173079 + genie.site2_pos = 150127173 genie_fusor_nonexonic = await translator_instance.from_genie( - site1_hugo, - site2_hugo, - site1_chrom, - site2_chrom, - site1_pos, - site2_pos, - annot, - reading_frame, + genie, CoordinateType.RESIDUE.value, Assembly.GRCH38.value, ) From ee3b1e43fc0f5b478e5c3d0077d3b6ce465d6211 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 16 Jan 2025 14:55:34 -0500 Subject: [PATCH 07/20] Add small changes to attributes based on jaffa column names --- src/fusor/fusion_caller_models.py | 5 +++-- src/fusor/translator.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index 309b5c4..3b436a9 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -49,8 +49,9 @@ class JAFFA(BaseModel): classification: str = Field( ..., description="The classification associated with the called fusion" ) - inframe: bool = Field( - ..., description="A boolean indicating if the fusion occurred in-frame" + inframe: bool | str = Field( + ..., + description="A boolean or string indicating if the fusion occurred in-frame", ) spanning_reads: int = Field( ..., diff --git 
a/src/fusor/translator.py b/src/fusor/translator.py index 86998cb..73a77cb 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -244,7 +244,7 @@ async def from_jaffa( tr_5prime, tr_3prime, ce, - jaffa.inframe, + jaffa.inframe if isinstance(jaffa.inframe, bool) else None, reads=read_data, ) From 3b718eda4740db3d080347fbc8b1f48bd124bc66 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Fri, 17 Jan 2025 11:09:31 -0500 Subject: [PATCH 08/20] Add initial work for extraction methods --- src/fusor/extract.py | 149 ++++++ src/fusor/fusion_caller_models.py | 4 +- tests/conftest.py | 9 + tests/fixtures/annotated.fusion.txt | 2 + .../final-list_candidate-fusion-genes.txt | 356 +++++++++++++ tests/fixtures/fusions_arriba_test.tsv | 2 + tests/fixtures/jaffa_results.csv | 492 ++++++++++++++++++ ...tar-fusion.fusion_predictions.abridged.tsv | 38 ++ tests/test_extractions.py | 66 +++ 9 files changed, 1117 insertions(+), 1 deletion(-) create mode 100644 src/fusor/extract.py create mode 100644 tests/fixtures/annotated.fusion.txt create mode 100644 tests/fixtures/final-list_candidate-fusion-genes.txt create mode 100644 tests/fixtures/fusions_arriba_test.tsv create mode 100644 tests/fixtures/jaffa_results.csv create mode 100644 tests/fixtures/star-fusion.fusion_predictions.abridged.tsv create mode 100644 tests/test_extractions.py diff --git a/src/fusor/extract.py b/src/fusor/extract.py new file mode 100644 index 0000000..c049bf5 --- /dev/null +++ b/src/fusor/extract.py @@ -0,0 +1,149 @@ +"""Module for extracting data from fusion caller output and coverting to pydantic +objects +""" + +import csv +import logging +from pathlib import Path + +from fusor.fusion_caller_models import JAFFA, Arriba, Cicero, FusionCatcher, STARFusion + +_logger = logging.getLogger(__name__) + + +def get_jaffa_records(path: Path) -> list[JAFFA] | None: + """Load fusions from JAFFA csv file + + :param path: The path to the file of JAFFA fusions + :return A list of JAFFA objects, or None if 
the specified file does not exist + """ + if not path.exists(): + statement = f"{path!s} does not exist" + _logger.error(statement) + return None + fusions_list: list[JAFFA] = [] + column_rename = { + "fusion genes": "fusion_genes", + "spanning reads": "spanning_reads", + "spanning pairs": "spanning_pairs", + } + with path.open() as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + row = {column_rename.get(key, key): value for key, value in row.items()} + fusions_list.append(JAFFA(**row)) + return fusions_list + + +def get_star_fusion_records(path: Path) -> list[STARFusion] | None: + """Load fusions from STAR-Fusion tsv file + + :param path: The path to the file of STAR-Fusion fusions + :return A list of STAR-Fusion objects, or None if the specified file does not exist + """ + if not path.exists(): + statement = f"{path!s} does not exist" + _logger.error(statement) + return None + fusions_list: list[STARFusion] = [] + column_rename = { + "LeftGene": "left_gene", + "RightGene": "right_gene", + "LeftBreakpoint": "left_breakpoint", + "RightBreakpoint": "right_breakpoint", + "JunctionReadCount": "junction_read_count", + "SpanningFragCount": "spanning_frag_count", + } + with path.open() as csvfile: + reader = csv.DictReader(csvfile, delimiter="\t") + for row in reader: + row = {column_rename.get(key, key): value for key, value in row.items()} + fusions_list.append(STARFusion(**row)) + return fusions_list + + +def get_fusion_catcher_records(path: Path) -> list[FusionCatcher] | None: + """Load fusions from FusionCatcher txt file + + :param path: The path to the file of FusionCatcher fusions + :return A list of FusionCatcher objects, or None if the specified file does not exist + """ + if not path.exists(): + statement = f"{path!s} does not exist" + _logger.error(statement) + return None + fusions_list: list[FusionCatcher] = [] + column_rename = { + "Gene_1_symbol(5end_fusion_partner)": "five_prime_partner", + "Gene_2_symbol(3end_fusion_partner)": 
"three_prime_partner", + "Fusion_point_for_gene_1(5end_fusion_partner)": "five_prime_fusion_point", + "Fusion_point_for_gene_2(3end_fusion_partner)": "three_prime_fusion_point", + "Predicted_effect": "predicted_effect", + "Spanning_unique_reads": "spanning_unique_reads", + "Spanning_pairs": "spanning_reads", + "Fusion_sequence": "fusion_sequence", + } + with path.open() as csvfile: + reader = csv.DictReader(csvfile, delimiter="\t") + for row in reader: + row = {column_rename.get(key, key): value for key, value in row.items()} + fusions_list.append(FusionCatcher(**row)) + return fusions_list + + +def get_arriba_records(path: Path) -> list[Arriba]: + """Load fusions from Arriba tsv file + + :param path: The path to the file of Arriba fusions + :return A list of Arriba objects, or None if the specified file does not exist + """ + if not path.exists(): + statement = f"{path!s} does not exist" + _logger.error(statement) + return None + fusions_list: list[Arriba] = [] + column_rename = { + "#gene1": "gene1", + "strand1(gene/fusion)": "strand1", + "strand2(gene/fusion)": "strand2", + "type": "event_type", + "reading_frame": "rf", + } + with path.open() as csvfile: + reader = csv.DictReader(csvfile, delimiter="\t") + for row in reader: + row = {column_rename.get(key, key): value for key, value in row.items()} + fusions_list.append(Arriba(**row)) + return fusions_list + + +def get_cicero_records(path: Path) -> list[Cicero]: + """Load fusions from Cicero txt file + + :param path: The path to the file of Cicero fusions + :return A list of Cicero objects, or None if the specified file does not exist + """ + if not path.exists(): + statement = f"{path!s} does not exist" + _logger.error(statement) + return None + fusions_list: list[Cicero] = [] + column_rename = { + "geneA": "gene_5prime", + "geneB": "gene_3prime", + "chrA": "chr_5prime", + "chrB": "chr_3prime", + "posA": "pos_5prime", + "posB": "pos_3prime", + "type": "event_type", + "readsA": "reads_5prime", + "readsB": 
"reads_3prime", + "coverageA": "coverage_5prime", + "coverageB": "coverage_3prime", + } + with path.open() as csvfile: + reader = csv.DictReader(csvfile, delimiter="\t") + for row in reader: + row = {column_rename.get(key, key): value for key, value in row.items()} + fusions_list.append(Cicero(**row)) + return fusions_list diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index 3b436a9..45c56a7 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -135,7 +135,9 @@ class Arriba(BaseModel): ) breakpoint1: str = Field(..., description="The chromosome and breakpoint for gene1") breakpoint2: str = Field(..., description="The chromosome and breakpoint for gene2") - event: str = Field(..., description=" An inference about the type of fusion event") + event_type: str = Field( + ..., description=" An inference about the type of fusion event" + ) confidence: str = Field( ..., description="A metric describing the confidence of the fusion prediction" ) diff --git a/tests/conftest.py b/tests/conftest.py index ec7f498..390b24f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ import asyncio import logging +from pathlib import Path import pytest from cool_seq_tool.app import CoolSeqTool @@ -9,6 +10,8 @@ from fusor.fusor import FUSOR from fusor.translator import Translator +FIXTURE_DATA_DIR = Path(__file__).parents[0].resolve() / "fixtures" + def pytest_addoption(parser): """Add custom commands to pytest invocation. 
@@ -39,6 +42,12 @@ def event_loop(): loop.close() +@pytest.fixture(scope="session") +def fixture_data_dir(): + """Provide test data directory.""" + return FIXTURE_DATA_DIR + + @pytest.fixture(scope="session") def fusor_instance(): """Create test fixture for fusor object diff --git a/tests/fixtures/annotated.fusion.txt b/tests/fixtures/annotated.fusion.txt new file mode 100644 index 0000000..f76562b --- /dev/null +++ b/tests/fixtures/annotated.fusion.txt @@ -0,0 +1,2 @@ +sample geneA chrA posA ortA featureA geneB chrB posB ortB featureB sv_ort readsA readsB matchA matchB repeatA repeatB coverageA coverageB ratioA ratioB qposA qposB total_readsA total_readsB contig type +test TCF3 chr19 1619111 - coding PBX1 chr1 164792493 + coding > 62 78 98 93 0.00 0.00 1145 1756 0.38 0.77 98 97 399 227 CCTCGCAGGCAGCACCAGCCTCATGCACAACCACGCGGCCCTCCCCAGCCAGCCAGGCACCCTCCCTGACCTGTCTCGGCCTCCCGACTCCTACAGTGTTTTGAGTATCCGAGGAGCCCAGGAGGAGGAACCCACAGACCCCCAGCTGATGCGGCTGGACAACATGCTGTTAGCGGAAGGCGTGGCGGGGG CTX diff --git a/tests/fixtures/final-list_candidate-fusion-genes.txt b/tests/fixtures/final-list_candidate-fusion-genes.txt new file mode 100644 index 0000000..d60e82f --- /dev/null +++ b/tests/fixtures/final-list_candidate-fusion-genes.txt @@ -0,0 +1,356 @@ +Gene_1_symbol(5end_fusion_partner) Gene_2_symbol(3end_fusion_partner) Fusion_description Counts_of_common_mapping_reads Spanning_pairs Spanning_unique_reads Longest_anchor_found Fusion_finding_method Fusion_point_for_gene_1(5end_fusion_partner) Fusion_point_for_gene_2(3end_fusion_partner) Gene_1_id(5end_fusion_partner) Gene_2_id(3end_fusion_partner) Exon_1_id(5end_fusion_partner) Exon_2_id(3end_fusion_partner) Fusion_sequence Predicted_effect +ALK EML4 known,similar_reads,oncogene,cosmic,chimerdb2,cgp,ticdb,tcga,cell_lines,ambiguous,18cancers,chimerdb3kb,chimerdb3pub,chimerdb3seq,cancer,tumor,m543,tcga-cancer 39165 545 4 36 BOWTIE+BLAT 2:29223427:- 2:42295181:+ ENSG00000171094 ENSG00000143924 
CCCTGAGTACAAGCTGAGCAAGCTCCGCACCTCGACCATCATGACCGACT*AATTACATGCGGTAAATCTCATATTTTCTTCTGGACCTGGAGCGGCAATT out-of-frame +EML4 ALK known,similar_reads,oncogene,cosmic,chimerdb2,cgp,ticdb,tcga,cell_lines,ambiguous,18cancers,chimerdb3kb,chimerdb3pub,chimerdb3seq,cancer,tumor,m543,tcga-cancer,exon-exon 39165 545 27 30 BOWTIE;BOWTIE+STAR 2:42295516:+ 2:29223528:- ENSG00000143924 ENSG00000171094 ENSE00003554806 ENSE00001154407 AGCAAAACTACTGTAGAGCCCACACCTGGGAAAGGACCTAAAG*TGTACCGCCGGAAGCACCAGGAGCTGCAAGCCATGCAGATGGA in-frame +EML4 ALK known,similar_reads,oncogene,cosmic,chimerdb2,cgp,ticdb,tcga,cell_lines,ambiguous,18cancers,chimerdb3kb,chimerdb3pub,chimerdb3seq,cancer,tumor,m543,tcga-cancer 39165 545 2 32 BOWTIE+STAR 2:42235381:+ 2:29790679:- ENSG00000143924 ENSG00000171094 CACCTGTAATCCCAGCTACTCAGGAGGCTAAGGCGCGAGAATTACTTGAG*CCTGGGAGGCAGAGGTTGCAGTGAGCCGAGATCGTGCCATTGCATTCCAG intronic/intronic +EML4 ALK known,similar_reads,oncogene,cosmic,chimerdb2,cgp,ticdb,tcga,cell_lines,ambiguous,18cancers,chimerdb3kb,chimerdb3pub,chimerdb3seq,cancer,tumor,m543,tcga-cancer 39165 545 2 19 BOWTIE+STAR 2:42177812:+ 2:29286943:- ENSG00000143924 ENSG00000171094 TGTGCATGACCTGTTAATGGTTTTTGAAATCAGTTTAGTCGTGGTCAGGA*AAAAAAAAAAAAAAATGAGATGGTGTGTCACAGACTAAGTCAAGGAATAT intronic/intronic +KIF5B RET known,oncogene,cosmic,cgp,ticdb,chimerdb3kb,chimerdb3pub,chimerdb3seq,cancer,tumor,exon-exon 0 505 27 30 BOWTIE;BOWTIE+BLAT;BOWTIE+STAR 10:32017143:- 10:43114480:+ ENSG00000170759 ENSG00000165731 ENSE00001163712 ENSE00001164787 AGGTCAAAGAATATGGCCAGAAGAGGGCATTCTGCACAGATTG*ATCCACTGTGCGACGAGCTGTGCCGCACGGTGATCGCAGCCGC in-frame +RET KIF5B known,oncogene,cosmic,cgp,ticdb,chimerdb3kb,chimerdb3pub,chimerdb3seq,cancer,tumor 0 505 2 33 BOWTIE+BLAT;BOWTIE+STAR 10:43114611:+ 10:32017241:- ENSG00000165731 ENSG00000170759 CTTCTGCATCCACTGCTACCACAAGTTTGCCCACAAGCCACCCATCTCCT*CATCTCGTGATCGCAAACGCTATCAGCAAGAAGTAGATCGCATAAAGGAA in-frame +LMNA NTRK1 known,oncogene,cgp,chimerdb3kb,chimerdb3pub,cancer,tumor,exon-exon 0 476 27 30 
BOWTIE;BOWTIE+BLAT;BOWTIE+STAR 1:156130773:+ 1:156874906:+ ENSG00000160789 ENSG00000198400 ENSE00003659692 ENSE00003538368 GCTGGAGGGCGAGCTGCATGATCTGCGGGGCCAGGTGGCCAAG*GTCTCGGTGGCTGTGGGCCTGGCCGTCTTTGCCTGCCTCTTCC in-frame +FGFR3 TACC3 known,adjacent,oncogene,cosmic,ticdb,tcga,cell_lines,18cancers,gliomas,chimerdb3kb,chimerdb3pub,chimerdb3seq,cancer,tcga-cancer,oesophagus,10K Date: Fri, 17 Jan 2025 13:19:32 -0500 Subject: [PATCH 09/20] Add remaining extraction methods, change some attributes --- src/fusor/extract.py | 100 ++++++++++++++++++++++++------ src/fusor/fusion_caller_models.py | 3 + src/fusor/translator.py | 2 +- tests/fixtures/enfusion_test.csv | 2 + tests/fixtures/genie_test.txt | 2 + tests/test_extractions.py | 24 +++++++ tests/test_translators.py | 2 +- 7 files changed, 115 insertions(+), 20 deletions(-) create mode 100644 tests/fixtures/enfusion_test.csv create mode 100644 tests/fixtures/genie_test.txt diff --git a/src/fusor/extract.py b/src/fusor/extract.py index c049bf5..32293c9 100644 --- a/src/fusor/extract.py +++ b/src/fusor/extract.py @@ -6,20 +6,39 @@ import logging from pathlib import Path -from fusor.fusion_caller_models import JAFFA, Arriba, Cicero, FusionCatcher, STARFusion +from fusor.fusion_caller_models import ( + JAFFA, + Arriba, + Cicero, + EnFusion, + FusionCatcher, + Genie, + STARFusion, +) _logger = logging.getLogger(__name__) +def _check_if_file_exists(path: Path) -> bool: + """Check if fusions file exists + + :param path: The path to the file + :return ``True`` if the file exists, ``False`` if not + """ + if not path.exists(): + statement = f"{path!s} does not exist" + _logger.error(statement) + return False + return True + + def get_jaffa_records(path: Path) -> list[JAFFA] | None: """Load fusions from JAFFA csv file :param path: The path to the file of JAFFA fusions :return A list of JAFFA objects, or None if the specified file does not exist """ - if not path.exists(): - statement = f"{path!s} does not exist" - 
_logger.error(statement) + if not _check_if_file_exists(path): return None fusions_list: list[JAFFA] = [] column_rename = { @@ -41,9 +60,7 @@ def get_star_fusion_records(path: Path) -> list[STARFusion] | None: :param path: The path to the file of STAR-Fusion fusions :return A list of STAR-Fusion objects, or None if the specified file does not exist """ - if not path.exists(): - statement = f"{path!s} does not exist" - _logger.error(statement) + if not _check_if_file_exists(path): return None fusions_list: list[STARFusion] = [] column_rename = { @@ -68,9 +85,7 @@ def get_fusion_catcher_records(path: Path) -> list[FusionCatcher] | None: :param path: The path to the file of FusionCatcher fusions :return A list of FusionCatcher objects, or None if the specified file does not exist """ - if not path.exists(): - statement = f"{path!s} does not exist" - _logger.error(statement) + if not _check_if_file_exists(path): return None fusions_list: list[FusionCatcher] = [] column_rename = { @@ -91,15 +106,13 @@ def get_fusion_catcher_records(path: Path) -> list[FusionCatcher] | None: return fusions_list -def get_arriba_records(path: Path) -> list[Arriba]: +def get_arriba_records(path: Path) -> list[Arriba] | None: """Load fusions from Arriba tsv file :param path: The path to the file of Arriba fusions :return A list of Arriba objects, or None if the specified file does not exist """ - if not path.exists(): - statement = f"{path!s} does not exist" - _logger.error(statement) + if not _check_if_file_exists(path): return None fusions_list: list[Arriba] = [] column_rename = { @@ -117,15 +130,13 @@ def get_arriba_records(path: Path) -> list[Arriba]: return fusions_list -def get_cicero_records(path: Path) -> list[Cicero]: +def get_cicero_records(path: Path) -> list[Cicero] | None: """Load fusions from Cicero txt file :param path: The path to the file of Cicero fusions :return A list of Cicero objects, or None if the specified file does not exist """ - if not path.exists(): - statement = 
f"{path!s} does not exist" - _logger.error(statement) + if not _check_if_file_exists(path): return None fusions_list: list[Cicero] = [] column_rename = { @@ -147,3 +158,56 @@ def get_cicero_records(path: Path) -> list[Cicero]: row = {column_rename.get(key, key): value for key, value in row.items()} fusions_list.append(Cicero(**row)) return fusions_list + + +def get_enfusion_records(path: Path) -> list[EnFusion] | None: + """Load fusions from EnFusion tsv file + + :param path: The path to the file of Enfusion fusions + :return A list of Enfusion objects, or None if the specified file does not exist + """ + if not _check_if_file_exists(path): + return None + fusions_list: list[EnFusion] = [] + column_rename = { + "Gene1": "gene_5prime", + "Gene2": "gene_3prime", + "Chr1": "chr_5prime", + "Chr2": "chr_3prime", + "Break1": "break_5prime", + "Break2": "break_3prime", + "FusionJunctionSequence": "fusion_junction_sequence", + } + with path.open() as csvfile: + reader = csv.DictReader(csvfile, delimiter="\t") + for row in reader: + row = {column_rename.get(key, key): value for key, value in row.items()} + fusions_list.append(EnFusion(**row)) + return fusions_list + + +def get_genie_records(path: Path) -> list[Genie] | None: + """Load fusions from Genie txt file + + :param path: The path to the file of Genie structural variants + :return A list of Genie objects, or None if the specified file does not exist + """ + if not _check_if_file_exists(path): + return None + fusions_list: list[Genie] = [] + column_rename = { + "Site1_Hugo_Symbol": "site1_hugo", + "Site2_Hugo_Symbol": "site2_hugo", + "Site1_Chromosome": "site1_chrom", + "Site2_Chromosome": "site2_chrom", + "Site1_Position": "site1_pos", + "Site2_Position": "site2_pos", + "Site2_Effect_On_Frame": "reading_frame", + "Annotation": "annot", + } + with path.open() as csvfile: + reader = csv.DictReader(csvfile, delimiter="\t") + for row in reader: + row = {column_rename.get(key, key): value for key, value in row.items()} + 
fusions_list.append(Genie(**row)) + return fusions_list diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index 45c56a7..d3b915c 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -223,6 +223,9 @@ class EnFusion(BaseModel): break_3prime: int = Field( ..., description="The 3' gene fusion partner genomic breakpoint" ) + fusion_junction_sequence: str | None = Field( + None, description="The sequence near the fusion junction" + ) class Genie(BaseModel): diff --git a/src/fusor/translator.py b/src/fusor/translator.py index 73a77cb..f22c460 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -498,7 +498,7 @@ async def from_arriba( eventType=EventType("read-through"), eventDescription=arriba.confidence, ) - if "read_through" in arriba.event + if "read_through" in arriba.event_type else CausativeEvent( eventType=EventType("rearrangement"), eventDescription=arriba.confidence, diff --git a/tests/fixtures/enfusion_test.csv b/tests/fixtures/enfusion_test.csv new file mode 100644 index 0000000..eac8911 --- /dev/null +++ b/tests/fixtures/enfusion_test.csv @@ -0,0 +1,2 @@ + Gene1 Gene2 Chr1 Chr2 Break1 Break2 FusionJunctionSequence +0 TPM3 PDGFRB 1 5 154170465 150126612 ATATATCTAGAG|ATAGACAGATAGAT diff --git a/tests/fixtures/genie_test.txt b/tests/fixtures/genie_test.txt new file mode 100644 index 0000000..d5c1b1b --- /dev/null +++ b/tests/fixtures/genie_test.txt @@ -0,0 +1,2 @@ +Site1_Hugo_Symbol Site2_Hugo_Symbol Site1_Chromosome Site2_Chromosome Site1_Position Site2_Position Site2_Effect_On_Frame Annotation +TPM3 PDGFRB 1 5 154170465 150126612 In_frame TMP3 (NM_152263.4) - PDGFRB (NM_002609.4) fusion diff --git a/tests/test_extractions.py b/tests/test_extractions.py index 451e187..99c49c3 100644 --- a/tests/test_extractions.py +++ b/tests/test_extractions.py @@ -5,7 +5,9 @@ from fusor.extract import ( get_arriba_records, get_cicero_records, + get_enfusion_records, 
get_fusion_catcher_records, + get_genie_records, get_jaffa_records, get_star_fusion_records, ) @@ -64,3 +66,25 @@ def test_get_cicero_records(fixture_data_dir): path = fixture_data_dir / "annnotated.fusion.txt" fusions_list = get_cicero_records(Path(path)) assert fusions_list is None + + +def test_get_enfusion_records(fixture_data_dir): + """Test that get_enfusion_records works correctly""" + path = fixture_data_dir / "enfusion_test.csv" + fusions_list = get_enfusion_records(Path(path)) + assert len(fusions_list) == 1 + + path = fixture_data_dir / "enfusions_test.csv" + fusions_list = get_enfusion_records(Path(path)) + assert fusions_list is None + + +def test_get_genie_records(fixture_data_dir): + """Test that get_genie_records works correctly""" + path = fixture_data_dir / "genie_test.txt" + fusions_list = get_genie_records(Path(path)) + assert len(fusions_list) == 1 + + path = fixture_data_dir / "genie_tests.txt" + fusions_list = get_genie_records(Path(path)) + assert fusions_list is None diff --git a/tests/test_translators.py b/tests/test_translators.py index 6811a5d..1ca6d9d 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -369,7 +369,7 @@ async def test_arriba( strand2="-/-", breakpoint1="1:154170465", breakpoint2="5:150126612", - event="translocation", + event_type="translocation", confidence="high", direction1="upstream", direction2="downstream", From ff8df520cdb1963c07fee283fe6cfa33f649693c Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 23 Jan 2025 10:15:31 -0500 Subject: [PATCH 10/20] Allow extra parameters to be supplied --- src/fusor/fusion_caller_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index 6953799..9c8d168 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -21,7 +21,7 @@ class Caller(str, Enum): GENIE = "GENIE" -class FusionCaller(ABC, BaseModel, extra="forbid"): +class 
FusionCaller(ABC, BaseModel): """ABC for fusion callers""" type: Caller From 4fb55c603f0c16e6f78fbd5c9d516ecf7ca6fa29 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 23 Jan 2025 15:11:54 -0500 Subject: [PATCH 11/20] Refactor extraction methods --- src/fusor/extract.py | 109 +++++++++++++------------------------- tests/test_extractions.py | 30 ++++++----- 2 files changed, 54 insertions(+), 85 deletions(-) diff --git a/src/fusor/extract.py b/src/fusor/extract.py index 32293c9..bb707b1 100644 --- a/src/fusor/extract.py +++ b/src/fusor/extract.py @@ -3,7 +3,6 @@ """ import csv -import logging from pathlib import Path from fusor.fusion_caller_models import ( @@ -11,15 +10,14 @@ Arriba, Cicero, EnFusion, + FusionCaller, FusionCatcher, Genie, STARFusion, ) -_logger = logging.getLogger(__name__) - -def _check_if_file_exists(path: Path) -> bool: +def _does_file_exist(path: Path) -> None: """Check if fusions file exists :param path: The path to the file @@ -27,31 +25,42 @@ def _check_if_file_exists(path: Path) -> bool: """ if not path.exists(): statement = f"{path!s} does not exist" - _logger.error(statement) - return False - return True + raise ValueError(statement) + return + + +def _process_fusion_caller_rows( + path: Path, caller: FusionCaller, column_rename: dict +) -> list[FusionCaller]: + """Convert rows of fusion caller output to Pydantic classes + + :param path: The path to the fusions file + :param caller: The name of the fusion caller + :param column_rename: A dictionary of column mappings + :return: A list of fusions, represented as Pydantic objects + """ + fusions_list = [] + with path.open() as csvfile: + reader = csv.DictReader(csvfile, delimiter="," if caller == JAFFA else "\t") + for row in reader: + row = {column_rename.get(key, key): value for key, value in row.items()} + fusions_list.append(caller(**row)) + return fusions_list -def get_jaffa_records(path: Path) -> list[JAFFA] | None: +def get_jaffa_records(path: Path) -> list[JAFFA]: """Load 
fusions from JAFFA csv file :param path: The path to the file of JAFFA fusions :return A list of JAFFA objects, or None if the specified file does not exist """ - if not _check_if_file_exists(path): - return None - fusions_list: list[JAFFA] = [] + _does_file_exist(path) column_rename = { "fusion genes": "fusion_genes", "spanning reads": "spanning_reads", "spanning pairs": "spanning_pairs", } - with path.open() as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - row = {column_rename.get(key, key): value for key, value in row.items()} - fusions_list.append(JAFFA(**row)) - return fusions_list + return _process_fusion_caller_rows(path, JAFFA, column_rename) def get_star_fusion_records(path: Path) -> list[STARFusion] | None: @@ -60,9 +69,7 @@ def get_star_fusion_records(path: Path) -> list[STARFusion] | None: :param path: The path to the file of STAR-Fusion fusions :return A list of STAR-Fusion objects, or None if the specified file does not exist """ - if not _check_if_file_exists(path): - return None - fusions_list: list[STARFusion] = [] + _does_file_exist(path) column_rename = { "LeftGene": "left_gene", "RightGene": "right_gene", @@ -71,12 +78,7 @@ def get_star_fusion_records(path: Path) -> list[STARFusion] | None: "JunctionReadCount": "junction_read_count", "SpanningFragCount": "spanning_frag_count", } - with path.open() as csvfile: - reader = csv.DictReader(csvfile, delimiter="\t") - for row in reader: - row = {column_rename.get(key, key): value for key, value in row.items()} - fusions_list.append(STARFusion(**row)) - return fusions_list + return _process_fusion_caller_rows(path, STARFusion, column_rename) def get_fusion_catcher_records(path: Path) -> list[FusionCatcher] | None: @@ -85,9 +87,7 @@ def get_fusion_catcher_records(path: Path) -> list[FusionCatcher] | None: :param path: The path to the file of FusionCatcher fusions :return A list of FusionCatcher objects, or None if the specified file does not exist """ - if not 
_check_if_file_exists(path): - return None - fusions_list: list[FusionCatcher] = [] + _does_file_exist(path) column_rename = { "Gene_1_symbol(5end_fusion_partner)": "five_prime_partner", "Gene_2_symbol(3end_fusion_partner)": "three_prime_partner", @@ -98,12 +98,7 @@ def get_fusion_catcher_records(path: Path) -> list[FusionCatcher] | None: "Spanning_pairs": "spanning_reads", "Fusion_sequence": "fusion_sequence", } - with path.open() as csvfile: - reader = csv.DictReader(csvfile, delimiter="\t") - for row in reader: - row = {column_rename.get(key, key): value for key, value in row.items()} - fusions_list.append(FusionCatcher(**row)) - return fusions_list + return _process_fusion_caller_rows(path, FusionCatcher, column_rename) def get_arriba_records(path: Path) -> list[Arriba] | None: @@ -112,9 +107,7 @@ def get_arriba_records(path: Path) -> list[Arriba] | None: :param path: The path to the file of Arriba fusions :return A list of Arriba objects, or None if the specified file does not exist """ - if not _check_if_file_exists(path): - return None - fusions_list: list[Arriba] = [] + _does_file_exist(path) column_rename = { "#gene1": "gene1", "strand1(gene/fusion)": "strand1", @@ -122,12 +115,7 @@ def get_arriba_records(path: Path) -> list[Arriba] | None: "type": "event_type", "reading_frame": "rf", } - with path.open() as csvfile: - reader = csv.DictReader(csvfile, delimiter="\t") - for row in reader: - row = {column_rename.get(key, key): value for key, value in row.items()} - fusions_list.append(Arriba(**row)) - return fusions_list + return _process_fusion_caller_rows(path, Arriba, column_rename) def get_cicero_records(path: Path) -> list[Cicero] | None: @@ -136,9 +124,7 @@ def get_cicero_records(path: Path) -> list[Cicero] | None: :param path: The path to the file of Cicero fusions :return A list of Cicero objects, or None if the specified file does not exist """ - if not _check_if_file_exists(path): - return None - fusions_list: list[Cicero] = [] + 
_does_file_exist(path) column_rename = { "geneA": "gene_5prime", "geneB": "gene_3prime", @@ -152,12 +138,7 @@ def get_cicero_records(path: Path) -> list[Cicero] | None: "coverageA": "coverage_5prime", "coverageB": "coverage_3prime", } - with path.open() as csvfile: - reader = csv.DictReader(csvfile, delimiter="\t") - for row in reader: - row = {column_rename.get(key, key): value for key, value in row.items()} - fusions_list.append(Cicero(**row)) - return fusions_list + return _process_fusion_caller_rows(path, Cicero, column_rename) def get_enfusion_records(path: Path) -> list[EnFusion] | None: @@ -166,9 +147,7 @@ def get_enfusion_records(path: Path) -> list[EnFusion] | None: :param path: The path to the file of Enfusion fusions :return A list of Enfusion objects, or None if the specified file does not exist """ - if not _check_if_file_exists(path): - return None - fusions_list: list[EnFusion] = [] + _does_file_exist(path) column_rename = { "Gene1": "gene_5prime", "Gene2": "gene_3prime", @@ -178,12 +157,7 @@ def get_enfusion_records(path: Path) -> list[EnFusion] | None: "Break2": "break_3prime", "FusionJunctionSequence": "fusion_junction_sequence", } - with path.open() as csvfile: - reader = csv.DictReader(csvfile, delimiter="\t") - for row in reader: - row = {column_rename.get(key, key): value for key, value in row.items()} - fusions_list.append(EnFusion(**row)) - return fusions_list + return _process_fusion_caller_rows(path, EnFusion, column_rename) def get_genie_records(path: Path) -> list[Genie] | None: @@ -192,9 +166,7 @@ def get_genie_records(path: Path) -> list[Genie] | None: :param path: The path to the file of Genie structural variants :return A list of Genie objects, or None if the specified file does not exist """ - if not _check_if_file_exists(path): - return None - fusions_list: list[Genie] = [] + _does_file_exist(path) column_rename = { "Site1_Hugo_Symbol": "site1_hugo", "Site2_Hugo_Symbol": "site2_hugo", @@ -205,9 +177,4 @@ def get_genie_records(path: 
Path) -> list[Genie] | None: "Site2_Effect_On_Frame": "reading_frame", "Annotation": "annot", } - with path.open() as csvfile: - reader = csv.DictReader(csvfile, delimiter="\t") - for row in reader: - row = {column_rename.get(key, key): value for key, value in row.items()} - fusions_list.append(Genie(**row)) - return fusions_list + return _process_fusion_caller_rows(path, Genie, column_rename) diff --git a/tests/test_extractions.py b/tests/test_extractions.py index 99c49c3..f9deb8b 100644 --- a/tests/test_extractions.py +++ b/tests/test_extractions.py @@ -2,6 +2,8 @@ from pathlib import Path +import pytest + from fusor.extract import ( get_arriba_records, get_cicero_records, @@ -20,8 +22,8 @@ def test_get_jaffa_records(fixture_data_dir): assert len(fusions_list) == 491 path = fixture_data_dir / "jaffa_resultss.csv" - fusions_list = get_jaffa_records(Path(path)) - assert fusions_list is None + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert get_jaffa_records(path) def test_get_star_fusion_records(fixture_data_dir): @@ -31,8 +33,8 @@ def test_get_star_fusion_records(fixture_data_dir): assert len(fusions_list) == 37 path = fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsvs" - fusions_list = get_star_fusion_records(Path(path)) - assert fusions_list is None + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert get_jaffa_records(path) def test_get_fusion_catcher_records(fixture_data_dir): @@ -42,8 +44,8 @@ def test_get_fusion_catcher_records(fixture_data_dir): assert len(fusions_list) == 355 path = fixture_data_dir / "final-list_candidate-fusion-genes.txts" - fusions_list = get_fusion_catcher_records(Path(path)) - assert fusions_list is None + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert get_jaffa_records(path) def test_get_arriba_records(fixture_data_dir): @@ -53,8 +55,8 @@ def test_get_arriba_records(fixture_data_dir): assert len(fusions_list) == 1 path = fixture_data_dir / 
"fusionsd_arriba_test.tsv" - fusions_list = get_arriba_records(Path(path)) - assert fusions_list is None + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert get_jaffa_records(path) def test_get_cicero_records(fixture_data_dir): @@ -64,8 +66,8 @@ def test_get_cicero_records(fixture_data_dir): assert len(fusions_list) == 1 path = fixture_data_dir / "annnotated.fusion.txt" - fusions_list = get_cicero_records(Path(path)) - assert fusions_list is None + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert get_jaffa_records(path) def test_get_enfusion_records(fixture_data_dir): @@ -75,8 +77,8 @@ def test_get_enfusion_records(fixture_data_dir): assert len(fusions_list) == 1 path = fixture_data_dir / "enfusions_test.csv" - fusions_list = get_enfusion_records(Path(path)) - assert fusions_list is None + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert get_jaffa_records(path) def test_get_genie_records(fixture_data_dir): @@ -86,5 +88,5 @@ def test_get_genie_records(fixture_data_dir): assert len(fusions_list) == 1 path = fixture_data_dir / "genie_tests.txt" - fusions_list = get_genie_records(Path(path)) - assert fusions_list is None + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert get_jaffa_records(path) From 7da8e76cffb4c8782464a5a7feb8f0497094cec4 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 23 Jan 2025 17:02:44 -0500 Subject: [PATCH 12/20] Store work --- src/fusor/extract.py | 11 +- src/fusor/fusion_caller_models.py | 299 +++++++++++++++++++++++------- tests/test_extractions.py | 20 +- 3 files changed, 251 insertions(+), 79 deletions(-) diff --git a/src/fusor/extract.py b/src/fusor/extract.py index bb707b1..4418845 100644 --- a/src/fusor/extract.py +++ b/src/fusor/extract.py @@ -21,7 +21,8 @@ def _does_file_exist(path: Path) -> None: """Check if fusions file exists :param path: The path to the file - :return ``True`` if the file exists, ``False`` if not + :return None 
+ :raise ValueError if the file does not exist at the specified path """ if not path.exists(): statement = f"{path!s} does not exist" @@ -39,6 +40,7 @@ def _process_fusion_caller_rows( :param column_rename: A dictionary of column mappings :return: A list of fusions, represented as Pydantic objects """ + _does_file_exist(path) fusions_list = [] with path.open() as csvfile: reader = csv.DictReader(csvfile, delimiter="," if caller == JAFFA else "\t") @@ -54,7 +56,6 @@ def get_jaffa_records(path: Path) -> list[JAFFA]: :param path: The path to the file of JAFFA fusions :return A list of JAFFA objects, or None if the specified file does not exist """ - _does_file_exist(path) column_rename = { "fusion genes": "fusion_genes", "spanning reads": "spanning_reads", @@ -69,7 +70,6 @@ def get_star_fusion_records(path: Path) -> list[STARFusion] | None: :param path: The path to the file of STAR-Fusion fusions :return A list of STAR-Fusion objects, or None if the specified file does not exist """ - _does_file_exist(path) column_rename = { "LeftGene": "left_gene", "RightGene": "right_gene", @@ -87,7 +87,6 @@ def get_fusion_catcher_records(path: Path) -> list[FusionCatcher] | None: :param path: The path to the file of FusionCatcher fusions :return A list of FusionCatcher objects, or None if the specified file does not exist """ - _does_file_exist(path) column_rename = { "Gene_1_symbol(5end_fusion_partner)": "five_prime_partner", "Gene_2_symbol(3end_fusion_partner)": "three_prime_partner", @@ -107,7 +106,6 @@ def get_arriba_records(path: Path) -> list[Arriba] | None: :param path: The path to the file of Arriba fusions :return A list of Arriba objects, or None if the specified file does not exist """ - _does_file_exist(path) column_rename = { "#gene1": "gene1", "strand1(gene/fusion)": "strand1", @@ -124,7 +122,6 @@ def get_cicero_records(path: Path) -> list[Cicero] | None: :param path: The path to the file of Cicero fusions :return A list of Cicero objects, or None if the specified 
file does not exist """ - _does_file_exist(path) column_rename = { "geneA": "gene_5prime", "geneB": "gene_3prime", @@ -147,7 +144,6 @@ def get_enfusion_records(path: Path) -> list[EnFusion] | None: :param path: The path to the file of Enfusion fusions :return A list of Enfusion objects, or None if the specified file does not exist """ - _does_file_exist(path) column_rename = { "Gene1": "gene_5prime", "Gene2": "gene_3prime", @@ -166,7 +162,6 @@ def get_genie_records(path: Path) -> list[Genie] | None: :param path: The path to the file of Genie structural variants :return A list of Genie objects, or None if the specified file does not exist """ - _does_file_exist(path) column_rename = { "Site1_Hugo_Symbol": "site1_hugo", "Site2_Hugo_Symbol": "site2_hugo", diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index 9c8d168..3afc903 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -1,7 +1,9 @@ """Schemas for outputs provided by different fusion callers""" -from abc import ABC +import csv +from abc import ABC, abstractmethod from enum import Enum +from pathlib import Path from typing import Literal from pydantic import BaseModel, Field @@ -26,45 +28,100 @@ class FusionCaller(ABC, BaseModel): type: Caller + class Config: + """Allow extra fields from fusion callers to be provided""" + + extra = "allow" + + def _does_file_exist(self, path: Path) -> None: + """Check if fusions file exists + + :param path: The path to the file + :return None + :raise ValueError if the file does not exist at the specified path + """ + if not path.exists(): + statement = f"{path!s} does not exist" + raise ValueError(statement) + return + + def _process_fusion_caller_rows( + self, path: Path, caller: Caller, column_rename: dict + ) -> list[Caller]: + """Convert rows of fusion caller output to Pydantic classes + + :param path: The path to the fusions file + :param caller: The name of the fusion caller + :param column_rename: A 
dictionary of column mappings + :return: A list of fusions, represented as Pydantic objects + """ + self._does_file_exist(path) + fusions_list = [] + with path.open() as csvfile: + reader = csv.DictReader( + csvfile, delimiter="," if caller == Caller.JAFFA else "\t" + ) + for row in reader: + row = {column_rename.get(key, key): value for key, value in row.items()} + fusions_list.append(caller(**row)) + return fusions_list + + @abstractmethod + def load_records(self, path: Path) -> list[Caller]: + """Abstract method to load records from a fusion caller file.""" + class JAFFA(FusionCaller): """Define parameters for JAFFA model""" type: Literal[Caller.JAFFA] = Caller.JAFFA fusion_genes: str = Field( - ..., description="A string containing the two fusion partners" + None, description="A string containing the two fusion partners" ) chrom1: str = Field( - ..., description="The chromosome indicated in the chrom1 column" + None, description="The chromosome indicated in the chrom1 column" ) base1: int = Field( - ..., description="The genomic position indicated in the base1 column" + None, description="The genomic position indicated in the base1 column" ) chrom2: str = Field( - ..., description="The chromosome indicated in the chrom2 column" + None, description="The chromosome indicated in the chrom2 column" ) base2: int = Field( - ..., description="The genomic position indicated in the base2 column" + None, description="The genomic position indicated in the base2 column" ) rearrangement: bool = Field( - ..., description=" A boolean indicating if a rearrangement occurred" + None, description=" A boolean indicating if a rearrangement occurred" ) classification: str = Field( - ..., description="The classification associated with the called fusion" + None, description="The classification associated with the called fusion" ) inframe: bool | str = Field( - ..., + None, description="A boolean or string indicating if the fusion occurred in-frame", ) spanning_reads: int = Field( - 
..., + None, description="The number of detected reads that span the junction between the two transcript. Although described as spanning reads, this aligns with our definition of split reads i.e. reads that have sequence belonging to the two fusion partners", ) spanning_pairs: int = Field( - ..., + None, description="The number of detected reads that align entirely on either side of the breakpoint", ) + def load_records(self, path: Path) -> list["JAFFA"]: + """Load fusions from JAFFA csv file + + :param path: The path to the file of JAFFA fusions + :return A list of JAFFA objects, or None if the specified file does not exist + """ + column_rename = { + "fusion genes": "fusion_genes", + "spanning reads": "spanning_reads", + "spanning pairs": "spanning_pairs", + } + return self._process_fusion_caller_rows(path, JAFFA, column_rename) + class STARFusion(BaseModel): """Define parameters for STAR-Fusion model""" @@ -72,57 +129,91 @@ class STARFusion(BaseModel): type: Literal[Caller.STAR_FUSION] = Caller.STAR_FUSION left_gene: str = Field(..., description="The gene indicated in the LeftGene column") right_gene: str = Field( - ..., description="The gene indicated in the RightGene column" + None, description="The gene indicated in the RightGene column" ) left_breakpoint: str = Field( - ..., description="The gene indicated in the LeftBreakpoint column" + None, description="The gene indicated in the LeftBreakpoint column" ) right_breakpoint: str = Field( - ..., description="The gene indicated in the RightBreakpoint column" + None, description="The gene indicated in the RightBreakpoint column" ) - annots: str = Field(..., description="The annotations associated with the fusion") + annots: str = Field(None, description="The annotations associated with the fusion") junction_read_count: int = Field( - ..., + None, description="The number of RNA-seq fragments that split the junction between the two transcript segments (from STAR-Fusion documentation)", ) spanning_frag_count: int = 
Field( - ..., + None, description="The number of RNA-seq fragments that encompass the fusion junction such that one read of the pair aligns to a different gene than the other paired-end read of that fragment (from STAR-Fusion documentation)", ) + def load_records(self, path: Path) -> list["STARFusion"]: + """Load fusions from STAR-Fusion tsv file + + :param path: The path to the file of STAR-Fusion fusions + :return A list of STAR-Fusion objects, or None if the specified file does not exist + """ + column_rename = { + "LeftGene": "left_gene", + "RightGene": "right_gene", + "LeftBreakpoint": "left_breakpoint", + "RightBreakpoint": "right_breakpoint", + "JunctionReadCount": "junction_read_count", + "SpanningFragCount": "spanning_frag_count", + } + return self._process_fusion_caller_rows(path, STARFusion, column_rename) + class FusionCatcher(BaseModel): """Define parameters for FusionCatcher model""" type: Literal[Caller.FUSION_CATCHER] = Caller.FUSION_CATCHER five_prime_partner: str = Field( - ..., description="Gene symbol for the 5' fusion partner" + None, description="Gene symbol for the 5' fusion partner" ) three_prime_partner: str = Field( - ..., description="Gene symbol for the 3' fusion partner" + None, description="Gene symbol for the 3' fusion partner" ) five_prime_fusion_point: str = Field( - ..., + None, description="Chromosomal position for the 5' end of the fusion junction. This coordinate is 1-based", ) three_prime_fusion_point: str = Field( - ..., + None, description="Chromosomal position for the 3' end of the fusion junction. 
This coordinate is 1-based", ) predicted_effect: str = Field( - ..., + None, description="The predicted effect of the fusion event, created using annotation from the Ensembl database", ) spanning_unique_reads: int = Field( - ..., description="The number of unique reads that map on the fusion junction" + None, description="The number of unique reads that map on the fusion junction" ) spanning_reads: int = Field( - ..., description="The number of paired reads that support the fusion" + None, description="The number of paired reads that support the fusion" ) fusion_sequence: str = Field( - ..., description="The inferred sequence around the fusion junction" + None, description="The inferred sequence around the fusion junction" ) + def load_records(self, path: Path) -> list["FusionCatcher"]: + """Load fusions from FusionCatcher txt file + + :param path: The path to the file of FusionCatcher fusions + :return A list of FusionCatcher objects, or None if the specified file does not exist + """ + column_rename = { + "Gene_1_symbol(5end_fusion_partner)": "five_prime_partner", + "Gene_2_symbol(3end_fusion_partner)": "three_prime_partner", + "Fusion_point_for_gene_1(5end_fusion_partner)": "five_prime_fusion_point", + "Fusion_point_for_gene_2(3end_fusion_partner)": "three_prime_fusion_point", + "Predicted_effect": "predicted_effect", + "Spanning_unique_reads": "spanning_unique_reads", + "Spanning_pairs": "spanning_reads", + "Fusion_sequence": "fusion_sequence", + } + return self._process_fusion_caller_rows(path, FusionCatcher, column_rename) + class Arriba(BaseModel): """Define parameters for Arriba model""" @@ -131,117 +222,193 @@ class Arriba(BaseModel): gene1: str = Field(..., description="The 5' gene fusion partner") gene2: str = Field(..., description="The 3' gene fusion partner") strand1: str = Field( - ..., description="The strand information for the 5' gene fusion partner" + None, description="The strand information for the 5' gene fusion partner" ) strand2: str = 
Field( - ..., description="The strand information for the 3' gene fusion partner" + None, description="The strand information for the 3' gene fusion partner" + ) + breakpoint1: str = Field( + None, description="The chromosome and breakpoint for gene1" + ) + breakpoint2: str = Field( + None, description="The chromosome and breakpoint for gene2" ) - breakpoint1: str = Field(..., description="The chromosome and breakpoint for gene1") - breakpoint2: str = Field(..., description="The chromosome and breakpoint for gene2") event_type: str = Field( - ..., description=" An inference about the type of fusion event" + None, description=" An inference about the type of fusion event" ) confidence: str = Field( - ..., description="A metric describing the confidence of the fusion prediction" + None, description="A metric describing the confidence of the fusion prediction" ) direction1: str = Field( - ..., + None, description="A description that indicates if the transcript segment starts or ends at breakpoint1", ) direction2: str = Field( - ..., + None, description="A description that indicates if the transcript segment starts or ends at breakpoint2", ) rf: str = Field( - ..., + None, description="A description if the reading frame is preserved for the fusion", ) split_reads1: int = Field( - ..., description="Number of supporting split fragments with anchor in gene1" + None, description="Number of supporting split fragments with anchor in gene1" ) split_reads2: int = Field( - ..., description="Number of supporting split fragments with anchor in gene2" + None, description="Number of supporting split fragments with anchor in gene2" ) discordant_mates: int = Field( - ..., description="Number of discordant mates supporting the fusion" + None, description="Number of discordant mates supporting the fusion" ) coverage1: int = Field( - ..., description="Number of fragments retained near breakpoint1" + None, description="Number of fragments retained near breakpoint1" ) coverage2: int = 
Field( - ..., description="Number of fragments retained near breakpoint2" + None, description="Number of fragments retained near breakpoint2" ) - fusion_transcript: str = Field(..., description="The assembled fusion transcript") + fusion_transcript: str = Field(None, description="The assembled fusion transcript") + + def load_records(self, path: Path) -> list["Arriba"]: + """Load fusions from Arriba tsv file + + :param path: The path to the file of Arriba fusions + :return A list of Arriba objects, or None if the specified file does not exist + """ + column_rename = { + "#gene1": "gene1", + "strand1(gene/fusion)": "strand1", + "strand2(gene/fusion)": "strand2", + "type": "event_type", + "reading_frame": "rf", + } + return self._process_fusion_caller_rows(path, Arriba, column_rename) class Cicero(BaseModel): """Define parameters for CICERO model""" type: Literal[Caller.CICERO] = Caller.CICERO - gene_5prime: str = Field(..., description="The gene symbol for the 5' partner") - gene_3prime: str = Field(..., description="The gene symbol for the 3' partner") - chr_5prime: str = Field(..., description="The chromosome for the 5' partner") - chr_3prime: str = Field(..., description="The chromosome for the 3' partner") + gene_5prime: str = Field(None, description="The gene symbol for the 5' partner") + gene_3prime: str = Field(None, description="The gene symbol for the 3' partner") + chr_5prime: str = Field(None, description="The chromosome for the 5' partner") + chr_3prime: str = Field(None, description="The chromosome for the 3' partner") pos_5prime: int = Field( - ..., description="The genomic breakpoint for the 5' partner" + None, description="The genomic breakpoint for the 5' partner" ) pos_3prime: int = Field( - ..., description="The genomic breakpoint for the 3' partner" + None, description="The genomic breakpoint for the 3' partner" ) sv_ort: str = Field( - ..., + None, description="Whether the mapping orientation of assembled contig (driven by structural variation) 
has confident biological meaning", ) event_type: str = Field( - ..., description="The structural variation event that created the called fusion" + None, + description="The structural variation event that created the called fusion", ) reads_5prime: int = Field( - ..., + None, description="The number of reads that support the breakpoint for the 5' partner", ) reads_3prime: int = Field( - ..., + None, description="The number of reads that support the breakpoint for the 3' partner", ) coverage_5prime: int = Field( - ..., description="The fragment coverage at the 5' breakpoint" + None, description="The fragment coverage at the 5' breakpoint" ) coverage_3prime: int = Field( - ..., description="The fragment coverage at the 3' breakpoint" + None, description="The fragment coverage at the 3' breakpoint" ) contig: str = Field(..., description="The assembled contig sequence for the fusion") + def load_records(self, path: Path) -> list["Cicero"]: + """Load fusions from Cicero txt file + + :param path: The path to the file of Cicero fusions + :return A list of Cicero objects, or None if the specified file does not exist + """ + column_rename = { + "geneA": "gene_5prime", + "geneB": "gene_3prime", + "chrA": "chr_5prime", + "chrB": "chr_3prime", + "posA": "pos_5prime", + "posB": "pos_3prime", + "type": "event_type", + "readsA": "reads_5prime", + "readsB": "reads_3prime", + "coverageA": "coverage_5prime", + "coverageB": "coverage_3prime", + } + return self._process_fusion_caller_rows(path, Cicero, column_rename) + class EnFusion(BaseModel): """Define parameters for EnFusion model""" type: Literal[Caller.ENFUSION] = Caller.ENFUSION - gene_5prime: str = Field(..., description="The 5' gene fusion partner") - gene_3prime: str = Field(..., description="The 3' gene fusion partner") - chr_5prime: int = Field(..., description="The 5' gene fusion partner chromosome") - chr_3prime: int = Field(..., description="The 3' gene fusion partner chromosome") + gene_5prime: str = Field(None, 
description="The 5' gene fusion partner") + gene_3prime: str = Field(None, description="The 3' gene fusion partner") + chr_5prime: int = Field(None, description="The 5' gene fusion partner chromosome") + chr_3prime: int = Field(None, description="The 3' gene fusion partner chromosome") break_5prime: int = Field( - ..., description="The 5' gene fusion partner genomic breakpoint" + None, description="The 5' gene fusion partner genomic breakpoint" ) break_3prime: int = Field( - ..., description="The 3' gene fusion partner genomic breakpoint" + None, description="The 3' gene fusion partner genomic breakpoint" ) fusion_junction_sequence: str | None = Field( None, description="The sequence near the fusion junction" ) + def load_records(self, path: Path) -> list["EnFusion"]: + """Load fusions from EnFusion tsv file + + :param path: The path to the file of Enfusion fusions + :return A list of Enfusion objects, or None if the specified file does not exist + """ + column_rename = { + "Gene1": "gene_5prime", + "Gene2": "gene_3prime", + "Chr1": "chr_5prime", + "Chr2": "chr_3prime", + "Break1": "break_5prime", + "Break2": "break_3prime", + "FusionJunctionSequence": "fusion_junction_sequence", + } + return self._process_fusion_caller_rows(path, EnFusion, column_rename) + class Genie(BaseModel): """Define parameters for Genie model""" type: Literal[Caller.GENIE] = Caller.GENIE - site1_hugo: str = Field(..., description="The HUGO symbol reported at site 1") - site2_hugo: str = Field(..., description="The HUGO symbol reported at site 2") - site1_chrom: int = Field(..., description="The chromosome reported at site 1") - site2_chrom: int = Field(..., description="The chromosome reported at site 2") - site1_pos: int = Field(..., description="The breakpoint reported at site 1") - site2_pos: int = Field(..., description="The breakpoint reported at site 2") - annot: str = Field(..., description="The annotation for the fusion event") + site1_hugo: str = Field(None, description="The HUGO 
symbol reported at site 1") + site2_hugo: str = Field(None, description="The HUGO symbol reported at site 2") + site1_chrom: int = Field(None, description="The chromosome reported at site 1") + site2_chrom: int = Field(None, description="The chromosome reported at site 2") + site1_pos: int = Field(None, description="The breakpoint reported at site 1") + site2_pos: int = Field(None, description="The breakpoint reported at site 2") + annot: str = Field(None, description="The annotation for the fusion event") reading_frame: str = Field( - ..., description="The reading frame status of the fusion" + None, description="The reading frame status of the fusion" ) + + def load_records(self, path: Path) -> list["Genie"]: + """Load fusions from Genie txt file + + :param path: The path to the file of Genie structural variants + :return A list of Genie objects, or None if the specified file does not exist + """ + column_rename = { + "Site1_Hugo_Symbol": "site1_hugo", + "Site2_Hugo_Symbol": "site2_hugo", + "Site1_Chromosome": "site1_chrom", + "Site2_Chromosome": "site2_chrom", + "Site1_Position": "site1_pos", + "Site2_Position": "site2_pos", + "Site2_Effect_On_Frame": "reading_frame", + "Annotation": "annot", + } + return self._process_fusion_caller_rows(path, Genie, column_rename) diff --git a/tests/test_extractions.py b/tests/test_extractions.py index f9deb8b..7742e7f 100644 --- a/tests/test_extractions.py +++ b/tests/test_extractions.py @@ -13,17 +13,27 @@ get_jaffa_records, get_star_fusion_records, ) +from fusor.fusion_caller_models import JAFFA def test_get_jaffa_records(fixture_data_dir): """Test that get_jaffa_records works correctly""" - path = fixture_data_dir / "jaffa_results.csv" - fusions_list = get_jaffa_records(Path(path)) - assert len(fusions_list) == 491 + path = Path(fixture_data_dir / "jaffa_results.csv") + jaffa_instance = JAFFA() + records = jaffa_instance.load_records(path) + assert len(records) == 491 - path = fixture_data_dir / "jaffa_resultss.csv" + path = 
Path(fixture_data_dir / "jaffa_resultss.csv") with pytest.raises(ValueError, match=f"{path} does not exist"): - assert get_jaffa_records(path) + assert jaffa_instance.load_records(path) + + # path = fixture_data_dir / "jaffa_results.csv" + # fusions_list = get_jaffa_records(Path(path)) + # assert len(fusions_list) == 491 + + # path = fixture_data_dir / "jaffa_resultss.csv" + # with pytest.raises(ValueError, match=f"{path} does not exist"): + # assert get_jaffa_records(path) def test_get_star_fusion_records(fixture_data_dir): From 3b2c5e7f27c4742af464939a5f083958c4ec9c78 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 23 Jan 2025 17:12:26 -0500 Subject: [PATCH 13/20] Add fusion caller models tests --- tests/test_fusion_caller_models.py | 107 +++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tests/test_fusion_caller_models.py diff --git a/tests/test_fusion_caller_models.py b/tests/test_fusion_caller_models.py new file mode 100644 index 0000000..c2177cf --- /dev/null +++ b/tests/test_fusion_caller_models.py @@ -0,0 +1,107 @@ +"""Module for testing extraction methods""" + +from pathlib import Path + +import pytest + +from fusor.fusion_caller_models import ( + JAFFA, + Arriba, + Cicero, + EnFusion, + FusionCatcher, + Genie, + STARFusion, +) + + +def test_get_jaffa_records(fixture_data_dir): + """Test that get_jaffa_records works correctly""" + path = Path(fixture_data_dir / "jaffa_results.csv") + jaffa_instance = JAFFA() + records = jaffa_instance.load_records(path) + assert len(records) == 491 + + path = Path(fixture_data_dir / "jaffa_resultss.csv") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert jaffa_instance.load_records(path) + + +def test_get_star_fusion_records(fixture_data_dir): + """Test that get_star_fusion_records works correctly""" + path = Path(fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsv") + sf_instance = STARFusion() + records = sf_instance.load_records(path) + 
assert len(records) == 37 + + path = Path(fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsvs") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert sf_instance.load_records(path) + + # path = fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsv" + # fusions_list = get_star_fusion_records(Path(path)) + # assert len(fusions_list) == 37 + + # path = fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsvs" + # with pytest.raises(ValueError, match=f"{path} does not exist"): + # assert get_jaffa_records(path) + + +def test_get_fusion_catcher_records(fixture_data_dir): + """Test that get_fusion_catcher_records works correctly""" + path = Path(fixture_data_dir / "final-list_candidate-fusion-genes.txt") + fc_instance = FusionCatcher() + fusions_list = fc_instance.load_records(path) + assert len(fusions_list) == 355 + + path = Path(fixture_data_dir / "final-list_candidate-fusion-genes.txts") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert fc_instance.load_records(path) + + +def test_get_arriba_records(fixture_data_dir): + """Test that get_arriba_records works correctly""" + path = Path(fixture_data_dir / "fusions_arriba_test.tsv") + arriba = Arriba() + fusions_list = arriba.load_records(path) + assert len(fusions_list) == 1 + + path = Path(fixture_data_dir / "fusionsd_arriba_test.tsv") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert arriba.load_records(path) + + +def test_get_cicero_records(fixture_data_dir): + """Test that get_cicero_records works correctly""" + path = Path(fixture_data_dir / "annotated.fusion.txt") + cicero = Cicero() + fusions_list = cicero.load_records(path) + assert len(fusions_list) == 1 + + path = Path(fixture_data_dir / "annnotated.fusion.txt") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert cicero.load_records(path) + + +def test_get_enfusion_records(fixture_data_dir): + """Test that get_enfusion_records works 
correctly""" + path = Path(fixture_data_dir / "enfusion_test.csv") + enfusion = EnFusion() + fusions_list = enfusion.load_records(path) + assert len(fusions_list) == 1 + + path = Path(fixture_data_dir / "enfusions_test.csv") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert enfusion.load_records(path) + + +def test_get_genie_records(fixture_data_dir): + """Test that get_genie_records works correctly""" + path = Path(fixture_data_dir / "genie_test.txt") + genie = Genie() + fusions_list = genie.load_records(path) + assert len(fusions_list) == 1 + + path = Path(fixture_data_dir / "genie_tests.txt") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert genie.load_records(path) From 8234e715c25737ede9b7ba4b12cb925687428861 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 23 Jan 2025 17:21:50 -0500 Subject: [PATCH 14/20] Move extraction methods into fusion caller classes --- src/fusor/extract.py | 175 ------------------------------ src/fusor/fusion_caller_models.py | 24 ++-- tests/test_extractions.py | 102 ----------------- 3 files changed, 14 insertions(+), 287 deletions(-) delete mode 100644 src/fusor/extract.py delete mode 100644 tests/test_extractions.py diff --git a/src/fusor/extract.py b/src/fusor/extract.py deleted file mode 100644 index 4418845..0000000 --- a/src/fusor/extract.py +++ /dev/null @@ -1,175 +0,0 @@ -"""Module for extracting data from fusion caller output and coverting to pydantic -objects -""" - -import csv -from pathlib import Path - -from fusor.fusion_caller_models import ( - JAFFA, - Arriba, - Cicero, - EnFusion, - FusionCaller, - FusionCatcher, - Genie, - STARFusion, -) - - -def _does_file_exist(path: Path) -> None: - """Check if fusions file exists - - :param path: The path to the file - :return None - :raise ValueError if the file does not exist at the specified path - """ - if not path.exists(): - statement = f"{path!s} does not exist" - raise ValueError(statement) - return - - -def 
_process_fusion_caller_rows( - path: Path, caller: FusionCaller, column_rename: dict -) -> list[FusionCaller]: - """Convert rows of fusion caller output to Pydantic classes - - :param path: The path to the fusions file - :param caller: The name of the fusion caller - :param column_rename: A dictionary of column mappings - :return: A list of fusions, represented as Pydantic objects - """ - _does_file_exist(path) - fusions_list = [] - with path.open() as csvfile: - reader = csv.DictReader(csvfile, delimiter="," if caller == JAFFA else "\t") - for row in reader: - row = {column_rename.get(key, key): value for key, value in row.items()} - fusions_list.append(caller(**row)) - return fusions_list - - -def get_jaffa_records(path: Path) -> list[JAFFA]: - """Load fusions from JAFFA csv file - - :param path: The path to the file of JAFFA fusions - :return A list of JAFFA objects, or None if the specified file does not exist - """ - column_rename = { - "fusion genes": "fusion_genes", - "spanning reads": "spanning_reads", - "spanning pairs": "spanning_pairs", - } - return _process_fusion_caller_rows(path, JAFFA, column_rename) - - -def get_star_fusion_records(path: Path) -> list[STARFusion] | None: - """Load fusions from STAR-Fusion tsv file - - :param path: The path to the file of STAR-Fusion fusions - :return A list of STAR-Fusion objects, or None if the specified file does not exist - """ - column_rename = { - "LeftGene": "left_gene", - "RightGene": "right_gene", - "LeftBreakpoint": "left_breakpoint", - "RightBreakpoint": "right_breakpoint", - "JunctionReadCount": "junction_read_count", - "SpanningFragCount": "spanning_frag_count", - } - return _process_fusion_caller_rows(path, STARFusion, column_rename) - - -def get_fusion_catcher_records(path: Path) -> list[FusionCatcher] | None: - """Load fusions from FusionCatcher txt file - - :param path: The path to the file of FusionCatcher fusions - :return A list of FusionCatcher objects, or None if the specified file does not 
exist - """ - column_rename = { - "Gene_1_symbol(5end_fusion_partner)": "five_prime_partner", - "Gene_2_symbol(3end_fusion_partner)": "three_prime_partner", - "Fusion_point_for_gene_1(5end_fusion_partner)": "five_prime_fusion_point", - "Fusion_point_for_gene_2(3end_fusion_partner)": "three_prime_fusion_point", - "Predicted_effect": "predicted_effect", - "Spanning_unique_reads": "spanning_unique_reads", - "Spanning_pairs": "spanning_reads", - "Fusion_sequence": "fusion_sequence", - } - return _process_fusion_caller_rows(path, FusionCatcher, column_rename) - - -def get_arriba_records(path: Path) -> list[Arriba] | None: - """Load fusions from Arriba tsv file - - :param path: The path to the file of Arriba fusions - :return A list of Arriba objects, or None if the specified file does not exist - """ - column_rename = { - "#gene1": "gene1", - "strand1(gene/fusion)": "strand1", - "strand2(gene/fusion)": "strand2", - "type": "event_type", - "reading_frame": "rf", - } - return _process_fusion_caller_rows(path, Arriba, column_rename) - - -def get_cicero_records(path: Path) -> list[Cicero] | None: - """Load fusions from Cicero txt file - - :param path: The path to the file of Cicero fusions - :return A list of Cicero objects, or None if the specified file does not exist - """ - column_rename = { - "geneA": "gene_5prime", - "geneB": "gene_3prime", - "chrA": "chr_5prime", - "chrB": "chr_3prime", - "posA": "pos_5prime", - "posB": "pos_3prime", - "type": "event_type", - "readsA": "reads_5prime", - "readsB": "reads_3prime", - "coverageA": "coverage_5prime", - "coverageB": "coverage_3prime", - } - return _process_fusion_caller_rows(path, Cicero, column_rename) - - -def get_enfusion_records(path: Path) -> list[EnFusion] | None: - """Load fusions from EnFusion tsv file - - :param path: The path to the file of Enfusion fusions - :return A list of Enfusion objects, or None if the specified file does not exist - """ - column_rename = { - "Gene1": "gene_5prime", - "Gene2": 
"gene_3prime", - "Chr1": "chr_5prime", - "Chr2": "chr_3prime", - "Break1": "break_5prime", - "Break2": "break_3prime", - "FusionJunctionSequence": "fusion_junction_sequence", - } - return _process_fusion_caller_rows(path, EnFusion, column_rename) - - -def get_genie_records(path: Path) -> list[Genie] | None: - """Load fusions from Genie txt file - - :param path: The path to the file of Genie structural variants - :return A list of Genie objects, or None if the specified file does not exist - """ - column_rename = { - "Site1_Hugo_Symbol": "site1_hugo", - "Site2_Hugo_Symbol": "site2_hugo", - "Site1_Chromosome": "site1_chrom", - "Site2_Chromosome": "site2_chrom", - "Site1_Position": "site1_pos", - "Site2_Position": "site2_pos", - "Site2_Effect_On_Frame": "reading_frame", - "Annotation": "annot", - } - return _process_fusion_caller_rows(path, Genie, column_rename) diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index 3afc903..13047dd 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -123,11 +123,13 @@ def load_records(self, path: Path) -> list["JAFFA"]: return self._process_fusion_caller_rows(path, JAFFA, column_rename) -class STARFusion(BaseModel): +class STARFusion(FusionCaller): """Define parameters for STAR-Fusion model""" type: Literal[Caller.STAR_FUSION] = Caller.STAR_FUSION - left_gene: str = Field(..., description="The gene indicated in the LeftGene column") + left_gene: str = Field( + None, description="The gene indicated in the LeftGene column" + ) right_gene: str = Field( None, description="The gene indicated in the RightGene column" ) @@ -164,7 +166,7 @@ def load_records(self, path: Path) -> list["STARFusion"]: return self._process_fusion_caller_rows(path, STARFusion, column_rename) -class FusionCatcher(BaseModel): +class FusionCatcher(FusionCaller): """Define parameters for FusionCatcher model""" type: Literal[Caller.FUSION_CATCHER] = Caller.FUSION_CATCHER @@ -215,12 +217,12 @@ def 
load_records(self, path: Path) -> list["FusionCatcher"]: return self._process_fusion_caller_rows(path, FusionCatcher, column_rename) -class Arriba(BaseModel): +class Arriba(FusionCaller): """Define parameters for Arriba model""" type: Literal[Caller.ARRIBA] = Caller.ARRIBA - gene1: str = Field(..., description="The 5' gene fusion partner") - gene2: str = Field(..., description="The 3' gene fusion partner") + gene1: str = Field(None, description="The 5' gene fusion partner") + gene2: str = Field(None, description="The 3' gene fusion partner") strand1: str = Field( None, description="The strand information for the 5' gene fusion partner" ) @@ -284,7 +286,7 @@ def load_records(self, path: Path) -> list["Arriba"]: return self._process_fusion_caller_rows(path, Arriba, column_rename) -class Cicero(BaseModel): +class Cicero(FusionCaller): """Define parameters for CICERO model""" type: Literal[Caller.CICERO] = Caller.CICERO @@ -320,7 +322,9 @@ class Cicero(BaseModel): coverage_3prime: int = Field( None, description="The fragment coverage at the 3' breakpoint" ) - contig: str = Field(..., description="The assembled contig sequence for the fusion") + contig: str = Field( + None, description="The assembled contig sequence for the fusion" + ) def load_records(self, path: Path) -> list["Cicero"]: """Load fusions from Cicero txt file @@ -344,7 +348,7 @@ def load_records(self, path: Path) -> list["Cicero"]: return self._process_fusion_caller_rows(path, Cicero, column_rename) -class EnFusion(BaseModel): +class EnFusion(FusionCaller): """Define parameters for EnFusion model""" type: Literal[Caller.ENFUSION] = Caller.ENFUSION @@ -380,7 +384,7 @@ def load_records(self, path: Path) -> list["EnFusion"]: return self._process_fusion_caller_rows(path, EnFusion, column_rename) -class Genie(BaseModel): +class Genie(FusionCaller): """Define parameters for Genie model""" type: Literal[Caller.GENIE] = Caller.GENIE diff --git a/tests/test_extractions.py b/tests/test_extractions.py deleted file 
mode 100644 index 7742e7f..0000000 --- a/tests/test_extractions.py +++ /dev/null @@ -1,102 +0,0 @@ -"""Module for testing extraction methods""" - -from pathlib import Path - -import pytest - -from fusor.extract import ( - get_arriba_records, - get_cicero_records, - get_enfusion_records, - get_fusion_catcher_records, - get_genie_records, - get_jaffa_records, - get_star_fusion_records, -) -from fusor.fusion_caller_models import JAFFA - - -def test_get_jaffa_records(fixture_data_dir): - """Test that get_jaffa_records works correctly""" - path = Path(fixture_data_dir / "jaffa_results.csv") - jaffa_instance = JAFFA() - records = jaffa_instance.load_records(path) - assert len(records) == 491 - - path = Path(fixture_data_dir / "jaffa_resultss.csv") - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert jaffa_instance.load_records(path) - - # path = fixture_data_dir / "jaffa_results.csv" - # fusions_list = get_jaffa_records(Path(path)) - # assert len(fusions_list) == 491 - - # path = fixture_data_dir / "jaffa_resultss.csv" - # with pytest.raises(ValueError, match=f"{path} does not exist"): - # assert get_jaffa_records(path) - - -def test_get_star_fusion_records(fixture_data_dir): - """Test that get_star_fusion_records works correctly""" - path = fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsv" - fusions_list = get_star_fusion_records(Path(path)) - assert len(fusions_list) == 37 - - path = fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsvs" - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert get_jaffa_records(path) - - -def test_get_fusion_catcher_records(fixture_data_dir): - """Test that get_fusion_catcher_records works correctly""" - path = fixture_data_dir / "final-list_candidate-fusion-genes.txt" - fusions_list = get_fusion_catcher_records(Path(path)) - assert len(fusions_list) == 355 - - path = fixture_data_dir / "final-list_candidate-fusion-genes.txts" - with pytest.raises(ValueError, 
match=f"{path} does not exist"): - assert get_jaffa_records(path) - - -def test_get_arriba_records(fixture_data_dir): - """Test that get_arriba_records works correctly""" - path = fixture_data_dir / "fusions_arriba_test.tsv" - fusions_list = get_arriba_records(Path(path)) - assert len(fusions_list) == 1 - - path = fixture_data_dir / "fusionsd_arriba_test.tsv" - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert get_jaffa_records(path) - - -def test_get_cicero_records(fixture_data_dir): - """Test that get_cicero_records works correctly""" - path = fixture_data_dir / "annotated.fusion.txt" - fusions_list = get_cicero_records(Path(path)) - assert len(fusions_list) == 1 - - path = fixture_data_dir / "annnotated.fusion.txt" - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert get_jaffa_records(path) - - -def test_get_enfusion_records(fixture_data_dir): - """Test that get_enfusion_records works correctly""" - path = fixture_data_dir / "enfusion_test.csv" - fusions_list = get_enfusion_records(Path(path)) - assert len(fusions_list) == 1 - - path = fixture_data_dir / "enfusions_test.csv" - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert get_jaffa_records(path) - - -def test_get_genie_records(fixture_data_dir): - """Test that get_genie_records works correctly""" - path = fixture_data_dir / "genie_test.txt" - fusions_list = get_genie_records(Path(path)) - assert len(fusions_list) == 1 - - path = fixture_data_dir / "genie_tests.txt" - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert get_jaffa_records(path) From 91bd5bc8e9db1f7ed0e507cf1fc99002d3e707e5 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 23 Jan 2025 17:24:52 -0500 Subject: [PATCH 15/20] Rename file and remove commented out code --- tests/test_fusion_caller_models.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/test_fusion_caller_models.py b/tests/test_fusion_caller_models.py 
index c2177cf..a3191a9 100644 --- a/tests/test_fusion_caller_models.py +++ b/tests/test_fusion_caller_models.py @@ -1,4 +1,4 @@ -"""Module for testing extraction methods""" +"""Module for testing fusion caller classes""" from pathlib import Path @@ -38,14 +38,6 @@ def test_get_star_fusion_records(fixture_data_dir): with pytest.raises(ValueError, match=f"{path} does not exist"): assert sf_instance.load_records(path) - # path = fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsv" - # fusions_list = get_star_fusion_records(Path(path)) - # assert len(fusions_list) == 37 - - # path = fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsvs" - # with pytest.raises(ValueError, match=f"{path} does not exist"): - # assert get_jaffa_records(path) - def test_get_fusion_catcher_records(fixture_data_dir): """Test that get_fusion_catcher_records works correctly""" From 3b7a50a7a0926b11b0fed74d81cb3c563c27f219 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Fri, 24 Jan 2025 11:30:49 -0500 Subject: [PATCH 16/20] Add static and class methods --- src/fusor/fusion_caller_models.py | 210 ++++++++++++++--------------- tests/test_fusion_caller_models.py | 35 ++--- 2 files changed, 118 insertions(+), 127 deletions(-) diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index 13047dd..4b448bb 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Literal -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field class Caller(str, Enum): @@ -27,13 +27,10 @@ class FusionCaller(ABC, BaseModel): """ABC for fusion callers""" type: Caller + model_config = ConfigDict(extra="allow") - class Config: - """Allow extra fields from fusion callers to be provided""" - - extra = "allow" - - def _does_file_exist(self, path: Path) -> None: + @staticmethod + def _does_file_exist(path: Path) -> None: """Check if fusions file exists 
:param path: The path to the file @@ -45,25 +42,27 @@ def _does_file_exist(self, path: Path) -> None: raise ValueError(statement) return + @classmethod def _process_fusion_caller_rows( - self, path: Path, caller: Caller, column_rename: dict - ) -> list[Caller]: + cls, + path: Path, + column_rename: dict, + delimeter: str, + ) -> list["FusionCaller"]: """Convert rows of fusion caller output to Pydantic classes :param path: The path to the fusions file - :param caller: The name of the fusion caller :param column_rename: A dictionary of column mappings + :param delimeter: The delimeter for the fusions file :return: A list of fusions, represented as Pydantic objects """ - self._does_file_exist(path) + cls._does_file_exist(path) fusions_list = [] with path.open() as csvfile: - reader = csv.DictReader( - csvfile, delimiter="," if caller == Caller.JAFFA else "\t" - ) + reader = csv.DictReader(csvfile, delimiter=delimeter) for row in reader: row = {column_rename.get(key, key): value for key, value in row.items()} - fusions_list.append(caller(**row)) + fusions_list.append(cls(**row)) return fusions_list @abstractmethod @@ -76,40 +75,41 @@ class JAFFA(FusionCaller): type: Literal[Caller.JAFFA] = Caller.JAFFA fusion_genes: str = Field( - None, description="A string containing the two fusion partners" + ..., description="A string containing the two fusion partners" ) chrom1: str = Field( - None, description="The chromosome indicated in the chrom1 column" + ..., description="The chromosome indicated in the chrom1 column" ) base1: int = Field( - None, description="The genomic position indicated in the base1 column" + ..., description="The genomic position indicated in the base1 column" ) chrom2: str = Field( - None, description="The chromosome indicated in the chrom2 column" + ..., description="The chromosome indicated in the chrom2 column" ) base2: int = Field( - None, description="The genomic position indicated in the base2 column" + ..., description="The genomic position 
indicated in the base2 column" ) rearrangement: bool = Field( - None, description=" A boolean indicating if a rearrangement occurred" + ..., description=" A boolean indicating if a rearrangement occurred" ) classification: str = Field( - None, description="The classification associated with the called fusion" + ..., description="The classification associated with the called fusion" ) inframe: bool | str = Field( - None, + ..., description="A boolean or string indicating if the fusion occurred in-frame", ) spanning_reads: int = Field( - None, + ..., description="The number of detected reads that span the junction between the two transcript. Although described as spanning reads, this aligns with our definition of split reads i.e. reads that have sequence belonging to the two fusion partners", ) spanning_pairs: int = Field( - None, + ..., description="The number of detected reads that align entirely on either side of the breakpoint", ) - def load_records(self, path: Path) -> list["JAFFA"]: + @classmethod + def load_records(cls, path: Path) -> list["JAFFA"]: """Load fusions from JAFFA csv file :param path: The path to the file of JAFFA fusions @@ -120,36 +120,35 @@ def load_records(self, path: Path) -> list["JAFFA"]: "spanning reads": "spanning_reads", "spanning pairs": "spanning_pairs", } - return self._process_fusion_caller_rows(path, JAFFA, column_rename) + return cls._process_fusion_caller_rows(path, column_rename, ",") class STARFusion(FusionCaller): """Define parameters for STAR-Fusion model""" type: Literal[Caller.STAR_FUSION] = Caller.STAR_FUSION - left_gene: str = Field( - None, description="The gene indicated in the LeftGene column" - ) + left_gene: str = Field(..., description="The gene indicated in the LeftGene column") right_gene: str = Field( - None, description="The gene indicated in the RightGene column" + ..., description="The gene indicated in the RightGene column" ) left_breakpoint: str = Field( - None, description="The gene indicated in the 
LeftBreakpoint column" + ..., description="The gene indicated in the LeftBreakpoint column" ) right_breakpoint: str = Field( - None, description="The gene indicated in the RightBreakpoint column" + ..., description="The gene indicated in the RightBreakpoint column" ) - annots: str = Field(None, description="The annotations associated with the fusion") + annots: str = Field(..., description="The annotations associated with the fusion") junction_read_count: int = Field( - None, + ..., description="The number of RNA-seq fragments that split the junction between the two transcript segments (from STAR-Fusion documentation)", ) spanning_frag_count: int = Field( - None, + ..., description="The number of RNA-seq fragments that encompass the fusion junction such that one read of the pair aligns to a different gene than the other paired-end read of that fragment (from STAR-Fusion documentation)", ) - def load_records(self, path: Path) -> list["STARFusion"]: + @classmethod + def load_records(cls, path: Path) -> list["STARFusion"]: """Load fusions from STAR-Fusion tsv file :param path: The path to the file of STAR-Fusion fusions @@ -163,7 +162,7 @@ def load_records(self, path: Path) -> list["STARFusion"]: "JunctionReadCount": "junction_read_count", "SpanningFragCount": "spanning_frag_count", } - return self._process_fusion_caller_rows(path, STARFusion, column_rename) + return cls._process_fusion_caller_rows(path, column_rename, "\t") class FusionCatcher(FusionCaller): @@ -171,34 +170,35 @@ class FusionCatcher(FusionCaller): type: Literal[Caller.FUSION_CATCHER] = Caller.FUSION_CATCHER five_prime_partner: str = Field( - None, description="Gene symbol for the 5' fusion partner" + ..., description="Gene symbol for the 5' fusion partner" ) three_prime_partner: str = Field( - None, description="Gene symbol for the 3' fusion partner" + ..., description="Gene symbol for the 3' fusion partner" ) five_prime_fusion_point: str = Field( - None, + ..., description="Chromosomal position for 
the 5' end of the fusion junction. This coordinate is 1-based", ) three_prime_fusion_point: str = Field( - None, + ..., description="Chromosomal position for the 3' end of the fusion junction. This coordinate is 1-based", ) predicted_effect: str = Field( - None, + ..., description="The predicted effect of the fusion event, created using annotation from the Ensembl database", ) spanning_unique_reads: int = Field( - None, description="The number of unique reads that map on the fusion junction" + ..., description="The number of unique reads that map on the fusion junction" ) spanning_reads: int = Field( - None, description="The number of paired reads that support the fusion" + ..., description="The number of paired reads that support the fusion" ) fusion_sequence: str = Field( - None, description="The inferred sequence around the fusion junction" + ..., description="The inferred sequence around the fusion junction" ) - def load_records(self, path: Path) -> list["FusionCatcher"]: + @classmethod + def load_records(cls, path: Path) -> list["FusionCatcher"]: """Load fusions from FusionCatcher txt file :param path: The path to the file of FusionCatcher fusions @@ -214,63 +214,60 @@ def load_records(self, path: Path) -> list["FusionCatcher"]: "Spanning_pairs": "spanning_reads", "Fusion_sequence": "fusion_sequence", } - return self._process_fusion_caller_rows(path, FusionCatcher, column_rename) + return cls._process_fusion_caller_rows(path, column_rename, "\t") class Arriba(FusionCaller): """Define parameters for Arriba model""" type: Literal[Caller.ARRIBA] = Caller.ARRIBA - gene1: str = Field(None, description="The 5' gene fusion partner") - gene2: str = Field(None, description="The 3' gene fusion partner") + gene1: str = Field(..., description="The 5' gene fusion partner") + gene2: str = Field(..., description="The 3' gene fusion partner") strand1: str = Field( - None, description="The strand information for the 5' gene fusion partner" + ..., description="The strand 
information for the 5' gene fusion partner" ) strand2: str = Field( - None, description="The strand information for the 3' gene fusion partner" - ) - breakpoint1: str = Field( - None, description="The chromosome and breakpoint for gene1" - ) - breakpoint2: str = Field( - None, description="The chromosome and breakpoint for gene2" + ..., description="The strand information for the 3' gene fusion partner" ) + breakpoint1: str = Field(..., description="The chromosome and breakpoint for gene1") + breakpoint2: str = Field(..., description="The chromosome and breakpoint for gene2") event_type: str = Field( - None, description=" An inference about the type of fusion event" + ..., description=" An inference about the type of fusion event" ) confidence: str = Field( - None, description="A metric describing the confidence of the fusion prediction" + ..., description="A metric describing the confidence of the fusion prediction" ) direction1: str = Field( - None, + ..., description="A description that indicates if the transcript segment starts or ends at breakpoint1", ) direction2: str = Field( - None, + ..., description="A description that indicates if the transcript segment starts or ends at breakpoint2", ) rf: str = Field( - None, + ..., description="A description if the reading frame is preserved for the fusion", ) split_reads1: int = Field( - None, description="Number of supporting split fragments with anchor in gene1" + ..., description="Number of supporting split fragments with anchor in gene1" ) split_reads2: int = Field( - None, description="Number of supporting split fragments with anchor in gene2" + ..., description="Number of supporting split fragments with anchor in gene2" ) discordant_mates: int = Field( - None, description="Number of discordant mates supporting the fusion" + ..., description="Number of discordant mates supporting the fusion" ) coverage1: int = Field( - None, description="Number of fragments retained near breakpoint1" + ..., description="Number 
of fragments retained near breakpoint1" ) coverage2: int = Field( - None, description="Number of fragments retained near breakpoint2" + ..., description="Number of fragments retained near breakpoint2" ) - fusion_transcript: str = Field(None, description="The assembled fusion transcript") + fusion_transcript: str = Field(..., description="The assembled fusion transcript") - def load_records(self, path: Path) -> list["Arriba"]: + @classmethod + def load_records(cls, path: Path) -> list["Arriba"]: """Load fusions from Arriba tsv file :param path: The path to the file of Arriba fusions @@ -283,50 +280,49 @@ def load_records(self, path: Path) -> list["Arriba"]: "type": "event_type", "reading_frame": "rf", } - return self._process_fusion_caller_rows(path, Arriba, column_rename) + return cls._process_fusion_caller_rows(path, column_rename, "\t") class Cicero(FusionCaller): """Define parameters for CICERO model""" type: Literal[Caller.CICERO] = Caller.CICERO - gene_5prime: str = Field(None, description="The gene symbol for the 5' partner") - gene_3prime: str = Field(None, description="The gene symbol for the 3' partner") - chr_5prime: str = Field(None, description="The chromosome for the 5' partner") - chr_3prime: str = Field(None, description="The chromosome for the 3' partner") + gene_5prime: str = Field(..., description="The gene symbol for the 5' partner") + gene_3prime: str = Field(..., description="The gene symbol for the 3' partner") + chr_5prime: str = Field(..., description="The chromosome for the 5' partner") + chr_3prime: str = Field(..., description="The chromosome for the 3' partner") pos_5prime: int = Field( - None, description="The genomic breakpoint for the 5' partner" + ..., description="The genomic breakpoint for the 5' partner" ) pos_3prime: int = Field( - None, description="The genomic breakpoint for the 3' partner" + ..., description="The genomic breakpoint for the 3' partner" ) sv_ort: str = Field( - None, + ..., description="Whether the mapping 
orientation of assembled contig (driven by structural variation) has confident biological meaning", ) event_type: str = Field( - None, + ..., description="The structural variation event that created the called fusion", ) reads_5prime: int = Field( - None, + ..., description="The number of reads that support the breakpoint for the 5' partner", ) reads_3prime: int = Field( - None, + ..., description="The number of reads that support the breakpoint for the 3' partner", ) coverage_5prime: int = Field( - None, description="The fragment coverage at the 5' breakpoint" + ..., description="The fragment coverage at the 5' breakpoint" ) coverage_3prime: int = Field( - None, description="The fragment coverage at the 3' breakpoint" - ) - contig: str = Field( - None, description="The assembled contig sequence for the fusion" + ..., description="The fragment coverage at the 3' breakpoint" ) + contig: str = Field(..., description="The assembled contig sequence for the fusion") - def load_records(self, path: Path) -> list["Cicero"]: + @classmethod + def load_records(cls, path: Path) -> list["Cicero"]: """Load fusions from Cicero txt file :param path: The path to the file of Cicero fusions @@ -345,28 +341,29 @@ def load_records(self, path: Path) -> list["Cicero"]: "coverageA": "coverage_5prime", "coverageB": "coverage_3prime", } - return self._process_fusion_caller_rows(path, Cicero, column_rename) + return cls._process_fusion_caller_rows(path, column_rename, "\t") class EnFusion(FusionCaller): """Define parameters for EnFusion model""" type: Literal[Caller.ENFUSION] = Caller.ENFUSION - gene_5prime: str = Field(None, description="The 5' gene fusion partner") - gene_3prime: str = Field(None, description="The 3' gene fusion partner") - chr_5prime: int = Field(None, description="The 5' gene fusion partner chromosome") - chr_3prime: int = Field(None, description="The 3' gene fusion partner chromosome") + gene_5prime: str = Field(..., description="The 5' gene fusion partner") + 
gene_3prime: str = Field(..., description="The 3' gene fusion partner") + chr_5prime: int = Field(..., description="The 5' gene fusion partner chromosome") + chr_3prime: int = Field(..., description="The 3' gene fusion partner chromosome") break_5prime: int = Field( - None, description="The 5' gene fusion partner genomic breakpoint" + ..., description="The 5' gene fusion partner genomic breakpoint" ) break_3prime: int = Field( - None, description="The 3' gene fusion partner genomic breakpoint" + ..., description="The 3' gene fusion partner genomic breakpoint" ) fusion_junction_sequence: str | None = Field( None, description="The sequence near the fusion junction" ) - def load_records(self, path: Path) -> list["EnFusion"]: + @classmethod + def load_records(cls, path: Path) -> list["EnFusion"]: """Load fusions from EnFusion tsv file :param path: The path to the file of Enfusion fusions @@ -381,25 +378,26 @@ def load_records(self, path: Path) -> list["EnFusion"]: "Break2": "break_3prime", "FusionJunctionSequence": "fusion_junction_sequence", } - return self._process_fusion_caller_rows(path, EnFusion, column_rename) + return cls._process_fusion_caller_rows(path, column_rename, "\t") class Genie(FusionCaller): """Define parameters for Genie model""" type: Literal[Caller.GENIE] = Caller.GENIE - site1_hugo: str = Field(None, description="The HUGO symbol reported at site 1") - site2_hugo: str = Field(None, description="The HUGO symbol reported at site 2") - site1_chrom: int = Field(None, description="The chromosome reported at site 1") - site2_chrom: int = Field(None, description="The chromosome reported at site 2") - site1_pos: int = Field(None, description="The breakpoint reported at site 1") - site2_pos: int = Field(None, description="The breakpoint reported at site 2") - annot: str = Field(None, description="The annotation for the fusion event") + site1_hugo: str = Field(..., description="The HUGO symbol reported at site 1") + site2_hugo: str = Field(..., 
description="The HUGO symbol reported at site 2") + site1_chrom: int = Field(..., description="The chromosome reported at site 1") + site2_chrom: int = Field(..., description="The chromosome reported at site 2") + site1_pos: int = Field(..., description="The breakpoint reported at site 1") + site2_pos: int = Field(..., description="The breakpoint reported at site 2") + annot: str = Field(..., description="The annotation for the fusion event") reading_frame: str = Field( - None, description="The reading frame status of the fusion" + ..., description="The reading frame status of the fusion" ) - def load_records(self, path: Path) -> list["Genie"]: + @classmethod + def load_records(cls, path: Path) -> list["Genie"]: """Load fusions from Genie txt file :param path: The path to the file of Genie structural variants @@ -415,4 +413,4 @@ def load_records(self, path: Path) -> list["Genie"]: "Site2_Effect_On_Frame": "reading_frame", "Annotation": "annot", } - return self._process_fusion_caller_rows(path, Genie, column_rename) + return cls._process_fusion_caller_rows(path, column_rename, "\t") diff --git a/tests/test_fusion_caller_models.py b/tests/test_fusion_caller_models.py index a3191a9..5a2cea5 100644 --- a/tests/test_fusion_caller_models.py +++ b/tests/test_fusion_caller_models.py @@ -18,82 +18,75 @@ def test_get_jaffa_records(fixture_data_dir): """Test that get_jaffa_records works correctly""" path = Path(fixture_data_dir / "jaffa_results.csv") - jaffa_instance = JAFFA() - records = jaffa_instance.load_records(path) + records = JAFFA.load_records(path) assert len(records) == 491 path = Path(fixture_data_dir / "jaffa_resultss.csv") with pytest.raises(ValueError, match=f"{path} does not exist"): - assert jaffa_instance.load_records(path) + assert JAFFA.load_records(path) def test_get_star_fusion_records(fixture_data_dir): """Test that get_star_fusion_records works correctly""" path = Path(fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsv") - sf_instance = 
STARFusion() - records = sf_instance.load_records(path) + records = STARFusion.load_records(path) assert len(records) == 37 path = Path(fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsvs") with pytest.raises(ValueError, match=f"{path} does not exist"): - assert sf_instance.load_records(path) + assert STARFusion.load_records(path) def test_get_fusion_catcher_records(fixture_data_dir): """Test that get_fusion_catcher_records works correctly""" path = Path(fixture_data_dir / "final-list_candidate-fusion-genes.txt") - fc_instance = FusionCatcher() - fusions_list = fc_instance.load_records(path) + fusions_list = FusionCatcher.load_records(path) assert len(fusions_list) == 355 path = Path(fixture_data_dir / "final-list_candidate-fusion-genes.txts") with pytest.raises(ValueError, match=f"{path} does not exist"): - assert fc_instance.load_records(path) + assert FusionCatcher.load_records(path) def test_get_arriba_records(fixture_data_dir): """Test that get_arriba_records works correctly""" path = Path(fixture_data_dir / "fusions_arriba_test.tsv") - arriba = Arriba() - fusions_list = arriba.load_records(path) + fusions_list = Arriba.load_records(path) assert len(fusions_list) == 1 path = Path(fixture_data_dir / "fusionsd_arriba_test.tsv") with pytest.raises(ValueError, match=f"{path} does not exist"): - assert arriba.load_records(path) + assert Arriba.load_records(path) def test_get_cicero_records(fixture_data_dir): """Test that get_cicero_records works correctly""" path = Path(fixture_data_dir / "annotated.fusion.txt") - cicero = Cicero() - fusions_list = cicero.load_records(path) + fusions_list = Cicero.load_records(path) assert len(fusions_list) == 1 path = Path(fixture_data_dir / "annnotated.fusion.txt") with pytest.raises(ValueError, match=f"{path} does not exist"): - assert cicero.load_records(path) + assert Cicero.load_records(path) def test_get_enfusion_records(fixture_data_dir): """Test that get_enfusion_records works correctly""" path = 
Path(fixture_data_dir / "enfusion_test.csv") - enfusion = EnFusion() - fusions_list = enfusion.load_records(path) + fusions_list = EnFusion.load_records(path) assert len(fusions_list) == 1 path = Path(fixture_data_dir / "enfusions_test.csv") with pytest.raises(ValueError, match=f"{path} does not exist"): - assert enfusion.load_records(path) + assert EnFusion.load_records(path) def test_get_genie_records(fixture_data_dir): """Test that get_genie_records works correctly""" path = Path(fixture_data_dir / "genie_test.txt") - genie = Genie() - fusions_list = genie.load_records(path) + fusions_list = Genie.load_records(path) assert len(fusions_list) == 1 path = Path(fixture_data_dir / "genie_tests.txt") with pytest.raises(ValueError, match=f"{path} does not exist"): - assert genie.load_records(path) + assert Genie.load_records(path) From 4e00a598b615a26a3f31aec7a90207b5d40fde6c Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Fri, 24 Jan 2025 11:33:37 -0500 Subject: [PATCH 17/20] Fix docstring --- src/fusor/fusion_caller_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index 4b448bb..a331429 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -66,7 +66,7 @@ def _process_fusion_caller_rows( return fusions_list @abstractmethod - def load_records(self, path: Path) -> list[Caller]: + def load_records(self, path: Path) -> list["FusionCaller"]: """Abstract method to load records from a fusion caller file.""" From a722022e4061d5f10ef3bc91f0dad180afbf7557 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Fri, 24 Jan 2025 16:28:06 -0500 Subject: [PATCH 18/20] Store latest changes --- src/fusor/fusion_caller_models.py | 169 +---------------------------- src/fusor/harvester.py | 166 ++++++++++++++++++++++++++++ tests/test_fusion_caller_models.py | 7 +- tests/test_harvesters.py | 99 +++++++++++++++++ 4 files changed, 270 insertions(+), 171 
deletions(-) create mode 100644 src/fusor/harvester.py create mode 100644 tests/test_harvesters.py diff --git a/src/fusor/fusion_caller_models.py b/src/fusor/fusion_caller_models.py index a331429..faffb21 100644 --- a/src/fusor/fusion_caller_models.py +++ b/src/fusor/fusion_caller_models.py @@ -1,9 +1,7 @@ """Schemas for outputs provided by different fusion callers""" -import csv -from abc import ABC, abstractmethod +from abc import ABC from enum import Enum -from pathlib import Path from typing import Literal from pydantic import BaseModel, ConfigDict, Field @@ -29,46 +27,6 @@ class FusionCaller(ABC, BaseModel): type: Caller model_config = ConfigDict(extra="allow") - @staticmethod - def _does_file_exist(path: Path) -> None: - """Check if fusions file exists - - :param path: The path to the file - :return None - :raise ValueError if the file does not exist at the specified path - """ - if not path.exists(): - statement = f"{path!s} does not exist" - raise ValueError(statement) - return - - @classmethod - def _process_fusion_caller_rows( - cls, - path: Path, - column_rename: dict, - delimeter: str, - ) -> list["FusionCaller"]: - """Convert rows of fusion caller output to Pydantic classes - - :param path: The path to the fusions file - :param column_rename: A dictionary of column mappings - :param delimeter: The delimeter for the fusions file - :return: A list of fusions, represented as Pydantic objects - """ - cls._does_file_exist(path) - fusions_list = [] - with path.open() as csvfile: - reader = csv.DictReader(csvfile, delimiter=delimeter) - for row in reader: - row = {column_rename.get(key, key): value for key, value in row.items()} - fusions_list.append(cls(**row)) - return fusions_list - - @abstractmethod - def load_records(self, path: Path) -> list["FusionCaller"]: - """Abstract method to load records from a fusion caller file.""" - class JAFFA(FusionCaller): """Define parameters for JAFFA model""" @@ -108,20 +66,6 @@ class JAFFA(FusionCaller): 
description="The number of detected reads that align entirely on either side of the breakpoint", ) - @classmethod - def load_records(cls, path: Path) -> list["JAFFA"]: - """Load fusions from JAFFA csv file - - :param path: The path to the file of JAFFA fusions - :return A list of JAFFA objects, or None if the specified file does not exist - """ - column_rename = { - "fusion genes": "fusion_genes", - "spanning reads": "spanning_reads", - "spanning pairs": "spanning_pairs", - } - return cls._process_fusion_caller_rows(path, column_rename, ",") - class STARFusion(FusionCaller): """Define parameters for STAR-Fusion model""" @@ -147,23 +91,6 @@ class STARFusion(FusionCaller): description="The number of RNA-seq fragments that encompass the fusion junction such that one read of the pair aligns to a different gene than the other paired-end read of that fragment (from STAR-Fusion documentation)", ) - @classmethod - def load_records(cls, path: Path) -> list["STARFusion"]: - """Load fusions from STAR-Fusion tsv file - - :param path: The path to the file of STAR-Fusion fusions - :return A list of STAR-Fusion objects, or None if the specified file does not exist - """ - column_rename = { - "LeftGene": "left_gene", - "RightGene": "right_gene", - "LeftBreakpoint": "left_breakpoint", - "RightBreakpoint": "right_breakpoint", - "JunctionReadCount": "junction_read_count", - "SpanningFragCount": "spanning_frag_count", - } - return cls._process_fusion_caller_rows(path, column_rename, "\t") - class FusionCatcher(FusionCaller): """Define parameters for FusionCatcher model""" @@ -197,25 +124,6 @@ class FusionCatcher(FusionCaller): ..., description="The inferred sequence around the fusion junction" ) - @classmethod - def load_records(cls, path: Path) -> list["FusionCatcher"]: - """Load fusions from FusionCatcher txt file - - :param path: The path to the file of FusionCatcher fusions - :return A list of FusionCatcher objects, or None if the specified file does not exist - """ - 
column_rename = { - "Gene_1_symbol(5end_fusion_partner)": "five_prime_partner", - "Gene_2_symbol(3end_fusion_partner)": "three_prime_partner", - "Fusion_point_for_gene_1(5end_fusion_partner)": "five_prime_fusion_point", - "Fusion_point_for_gene_2(3end_fusion_partner)": "three_prime_fusion_point", - "Predicted_effect": "predicted_effect", - "Spanning_unique_reads": "spanning_unique_reads", - "Spanning_pairs": "spanning_reads", - "Fusion_sequence": "fusion_sequence", - } - return cls._process_fusion_caller_rows(path, column_rename, "\t") - class Arriba(FusionCaller): """Define parameters for Arriba model""" @@ -266,22 +174,6 @@ class Arriba(FusionCaller): ) fusion_transcript: str = Field(..., description="The assembled fusion transcript") - @classmethod - def load_records(cls, path: Path) -> list["Arriba"]: - """Load fusions from Arriba tsv file - - :param path: The path to the file of Arriba fusions - :return A list of Arriba objects, or None if the specified file does not exist - """ - column_rename = { - "#gene1": "gene1", - "strand1(gene/fusion)": "strand1", - "strand2(gene/fusion)": "strand2", - "type": "event_type", - "reading_frame": "rf", - } - return cls._process_fusion_caller_rows(path, column_rename, "\t") - class Cicero(FusionCaller): """Define parameters for CICERO model""" @@ -321,28 +213,6 @@ class Cicero(FusionCaller): ) contig: str = Field(..., description="The assembled contig sequence for the fusion") - @classmethod - def load_records(cls, path: Path) -> list["Cicero"]: - """Load fusions from Cicero txt file - - :param path: The path to the file of Cicero fusions - :return A list of Cicero objects, or None if the specified file does not exist - """ - column_rename = { - "geneA": "gene_5prime", - "geneB": "gene_3prime", - "chrA": "chr_5prime", - "chrB": "chr_3prime", - "posA": "pos_5prime", - "posB": "pos_3prime", - "type": "event_type", - "readsA": "reads_5prime", - "readsB": "reads_3prime", - "coverageA": "coverage_5prime", - "coverageB": 
"coverage_3prime", - } - return cls._process_fusion_caller_rows(path, column_rename, "\t") - class EnFusion(FusionCaller): """Define parameters for EnFusion model""" @@ -362,24 +232,6 @@ class EnFusion(FusionCaller): None, description="The sequence near the fusion junction" ) - @classmethod - def load_records(cls, path: Path) -> list["EnFusion"]: - """Load fusions from EnFusion tsv file - - :param path: The path to the file of Enfusion fusions - :return A list of Enfusion objects, or None if the specified file does not exist - """ - column_rename = { - "Gene1": "gene_5prime", - "Gene2": "gene_3prime", - "Chr1": "chr_5prime", - "Chr2": "chr_3prime", - "Break1": "break_5prime", - "Break2": "break_3prime", - "FusionJunctionSequence": "fusion_junction_sequence", - } - return cls._process_fusion_caller_rows(path, column_rename, "\t") - class Genie(FusionCaller): """Define parameters for Genie model""" @@ -395,22 +247,3 @@ class Genie(FusionCaller): reading_frame: str = Field( ..., description="The reading frame status of the fusion" ) - - @classmethod - def load_records(cls, path: Path) -> list["Genie"]: - """Load fusions from Genie txt file - - :param path: The path to the file of Genie structural variants - :return A list of Genie objects, or None if the specified file does not exist - """ - column_rename = { - "Site1_Hugo_Symbol": "site1_hugo", - "Site2_Hugo_Symbol": "site2_hugo", - "Site1_Chromosome": "site1_chrom", - "Site2_Chromosome": "site2_chrom", - "Site1_Position": "site1_pos", - "Site2_Position": "site2_pos", - "Site2_Effect_On_Frame": "reading_frame", - "Annotation": "annot", - } - return cls._process_fusion_caller_rows(path, column_rename, "\t") diff --git a/src/fusor/harvester.py b/src/fusor/harvester.py new file mode 100644 index 0000000..a4e3006 --- /dev/null +++ b/src/fusor/harvester.py @@ -0,0 +1,166 @@ +"""Harvester methods for output from different fusion callers""" + +import csv +from abc import ABC +from pathlib import Path +from typing import 
ClassVar + +from fusor.fusion_caller_models import ( + JAFFA, + Arriba, + Cicero, + EnFusion, + FusionCaller, + FusionCatcher, + Genie, + STARFusion, +) + + +class FusionCallerHarvester(ABC): + """ABC for fusion caller harvesters""" + + fusion_caller: FusionCaller + column_rename: dict + delimeter: str + + def load_records( + self, + fusion_path: Path, + ) -> list[FusionCaller]: + """Convert rows of fusion caller output to Pydantic classes + + :param path: The path to the fusions file + :param column_rename: A dictionary of column mappings + :param delimeter: The delimeter for the fusions file + :raise ValueError: if the file does not exist at the specified path + :return: A list of fusions, represented as Pydantic objects + """ + if not fusion_path.exists(): + statement = f"{fusion_path!s} does not exist" + raise ValueError(statement) + fusions_list = [] + fields_to_keep = self.fusion_caller.__annotations__.keys() + with fusion_path.open() as csvfile: + reader = csv.DictReader(csvfile, delimiter=self.delimeter) + for row in reader: + row = { + self.column_rename.get(key, key): value + for key, value in row.items() + } + filered_row = { + key: value for key, value in row.items() if key in fields_to_keep + } + fusions_list.append(self.fusion_caller(**filered_row)) + return fusions_list + + +class JAFFAHarvester(FusionCallerHarvester): + """Class for harvesting JAFFA data""" + + column_rename: ClassVar[dict] = { + "fusion genes": "fusion_genes", + "spanning reads": "spanning_reads", + "spanning pairs": "spanning_pairs", + } + delimeter = "," + fusion_caller = JAFFA + + +class StarFusionHarvester(FusionCallerHarvester): + """Class for harvesting STAR-Fusion data""" + + column_rename: ClassVar[dict] = { + "LeftGene": "left_gene", + "RightGene": "right_gene", + "LeftBreakpoint": "left_breakpoint", + "RightBreakpoint": "right_breakpoint", + "JunctionReadCount": "junction_read_count", + "SpanningFragCount": "spanning_frag_count", + } + delimeter = "\t" + fusion_caller = 
STARFusion + + +class FusionCatcherHarvester(FusionCallerHarvester): + """Class for harvesting FusionCatcher data""" + + column_rename: ClassVar[dict] = { + "Gene_1_symbol(5end_fusion_partner)": "five_prime_partner", + "Gene_2_symbol(3end_fusion_partner)": "three_prime_partner", + "Fusion_point_for_gene_1(5end_fusion_partner)": "five_prime_fusion_point", + "Fusion_point_for_gene_2(3end_fusion_partner)": "three_prime_fusion_point", + "Predicted_effect": "predicted_effect", + "Spanning_unique_reads": "spanning_unique_reads", + "Spanning_pairs": "spanning_reads", + "Fusion_sequence": "fusion_sequence", + } + delimeter = "\t" + fusion_caller = FusionCatcher + + +class ArribaHarvester(FusionCallerHarvester): + """Class for harvesting Arriba data""" + + column_rename: ClassVar[dict] = { + "#gene1": "gene1", + "strand1(gene/fusion)": "strand1", + "strand2(gene/fusion)": "strand2", + "type": "event_type", + "reading_frame": "rf", + } + delimeter = "\t" + fusion_caller = Arriba + + +class CiceroHarvester(FusionCallerHarvester): + """Class for harvesting Cicero data""" + + column_rename: ClassVar[dict] = { + "geneA": "gene_5prime", + "geneB": "gene_3prime", + "chrA": "chr_5prime", + "chrB": "chr_3prime", + "posA": "pos_5prime", + "posB": "pos_3prime", + "type": "event_type", + "readsA": "reads_5prime", + "readsB": "reads_3prime", + "coverageA": "coverage_5prime", + "coverageB": "coverage_3prime", + } + delimeter = "\t" + fusion_caller = Cicero + + +class EnFusionHarvester(FusionCallerHarvester): + """Class for harvesting EnFusion data""" + + column_rename: ClassVar[dict] = { + "Gene1": "gene_5prime", + "Gene2": "gene_3prime", + "Chr1": "chr_5prime", + "Chr2": "chr_3prime", + "Break1": "break_5prime", + "Break2": "break_3prime", + "FusionJunctionSequence": "fusion_junction_sequence", + } + delimeter = "\t" + fusion_caller = EnFusion + + +class GenieHarvester(FusionCallerHarvester): + """Class for harvesting Genie data""" + + column_rename: ClassVar[dict] = { + 
"Site1_Hugo_Symbol": "site1_hugo", + "Site2_Hugo_Symbol": "site2_hugo", + "Site1_Chromosome": "site1_chrom", + "Site2_Chromosome": "site2_chrom", + "Site1_Position": "site1_pos", + "Site2_Position": "site2_pos", + "Site2_Effect_On_Frame": "reading_frame", + "Annotation": "annot", + } + delimeter = "\t" + fusion_caller = Genie diff --git a/tests/test_fusion_caller_models.py b/tests/test_fusion_caller_models.py index 5a2cea5..4f46470 100644 --- a/tests/test_fusion_caller_models.py +++ b/tests/test_fusion_caller_models.py @@ -5,7 +5,6 @@ import pytest from fusor.fusion_caller_models import ( - JAFFA, Arriba, Cicero, EnFusion, @@ -13,17 +12,19 @@ Genie, STARFusion, ) +from fusor.harvester import JAFFAHarvester def test_get_jaffa_records(fixture_data_dir): """Test that get_jaffa_records works correctly""" path = Path(fixture_data_dir / "jaffa_results.csv") - records = JAFFA.load_records(path) + # records = JAFFA.load_records(path) + records = JAFFAHarvester.load_records(path) assert len(records) == 491 path = Path(fixture_data_dir / "jaffa_resultss.csv") with pytest.raises(ValueError, match=f"{path} does not exist"): - assert JAFFA.load_records(path) + assert JAFFAHarvester.load_records(path) def test_get_star_fusion_records(fixture_data_dir): diff --git a/tests/test_harvesters.py b/tests/test_harvesters.py new file mode 100644 index 0000000..5860b9e --- /dev/null +++ b/tests/test_harvesters.py @@ -0,0 +1,99 @@ +"""Module for testing fusion caller classes""" + +from pathlib import Path + +import pytest + +from fusor.harvester import ( + ArribaHarvester, + CiceroHarvester, + EnFusionHarvester, + FusionCatcherHarvester, + GenieHarvester, + JAFFAHarvester, + StarFusionHarvester, +) + + +def test_get_jaffa_records(fixture_data_dir): + """Test that get_jaffa_records works correctly""" + path = Path(fixture_data_dir / "jaffa_results.csv") + harvester = JAFFAHarvester() + records = harvester.load_records(path) + assert len(records) == 491 + + path = Path(fixture_data_dir / 
"jaffa_resultss.csv") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert harvester.load_records(path) + + +def test_get_star_fusion_records(fixture_data_dir): + """Test that get_star_fusion_records works correctly""" + path = Path(fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsv") + harvester = StarFusionHarvester() + records = harvester.load_records(path) + assert len(records) == 37 + + path = Path(fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsvs") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert harvester.load_records(path) + + +def test_get_fusion_catcher_records(fixture_data_dir): + """Test that get_fusion_catcher_records works correctly""" + path = Path(fixture_data_dir / "final-list_candidate-fusion-genes.txt") + harvester = FusionCatcherHarvester() + fusions_list = harvester.load_records(path) + assert len(fusions_list) == 355 + + path = Path(fixture_data_dir / "final-list_candidate-fusion-genes.txts") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert harvester.load_records(path) + + +def test_get_arriba_records(fixture_data_dir): + """Test that get_arriba_records works correctly""" + path = Path(fixture_data_dir / "fusions_arriba_test.tsv") + harvester = ArribaHarvester() + fusions_list = harvester.load_records(path) + assert len(fusions_list) == 1 + + path = Path(fixture_data_dir / "fusionsd_arriba_test.tsv") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert harvester.load_records(path) + + +def test_get_cicero_records(fixture_data_dir): + """Test that get_cicero_records works correctly""" + path = Path(fixture_data_dir / "annotated.fusion.txt") + harvester = CiceroHarvester() + fusions_list = harvester.load_records(path) + assert len(fusions_list) == 1 + + path = Path(fixture_data_dir / "annnotated.fusion.txt") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert harvester.load_records(path) + + +def 
test_get_enfusion_records(fixture_data_dir): + """Test that get_enfusion_records works correctly""" + path = Path(fixture_data_dir / "enfusion_test.csv") + harvester = EnFusionHarvester() + fusions_list = harvester.load_records(path) + assert len(fusions_list) == 1 + + path = Path(fixture_data_dir / "enfusions_test.csv") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert harvester.load_records(path) + + +def test_get_genie_records(fixture_data_dir): + """Test that get_genie_records works correctly""" + path = Path(fixture_data_dir / "genie_test.txt") + harvester = GenieHarvester() + fusions_list = harvester.load_records(path) + assert len(fusions_list) == 1 + + path = Path(fixture_data_dir / "genie_tests.txt") + with pytest.raises(ValueError, match=f"{path} does not exist"): + assert harvester.load_records(path) From 3af24151b6f11b14c851269928f3789679cb713a Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Fri, 24 Jan 2025 16:30:49 -0500 Subject: [PATCH 19/20] Remove fusion caller model tests --- tests/test_fusion_caller_models.py | 93 ------------------------------ 1 file changed, 93 deletions(-) delete mode 100644 tests/test_fusion_caller_models.py diff --git a/tests/test_fusion_caller_models.py b/tests/test_fusion_caller_models.py deleted file mode 100644 index 4f46470..0000000 --- a/tests/test_fusion_caller_models.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Module for testing fusion caller classes""" - -from pathlib import Path - -import pytest - -from fusor.fusion_caller_models import ( - Arriba, - Cicero, - EnFusion, - FusionCatcher, - Genie, - STARFusion, -) -from fusor.harvester import JAFFAHarvester - - -def test_get_jaffa_records(fixture_data_dir): - """Test that get_jaffa_records works correctly""" - path = Path(fixture_data_dir / "jaffa_results.csv") - # records = JAFFA.load_records(path) - records = JAFFAHarvester.load_records(path) - assert len(records) == 491 - - path = Path(fixture_data_dir / "jaffa_resultss.csv") - with 
pytest.raises(ValueError, match=f"{path} does not exist"): - assert JAFFAHarvester.load_records(path) - - -def test_get_star_fusion_records(fixture_data_dir): - """Test that get_star_fusion_records works correctly""" - path = Path(fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsv") - records = STARFusion.load_records(path) - assert len(records) == 37 - - path = Path(fixture_data_dir / "star-fusion.fusion_predictions.abridged.tsvs") - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert STARFusion.load_records(path) - - -def test_get_fusion_catcher_records(fixture_data_dir): - """Test that get_fusion_catcher_records works correctly""" - path = Path(fixture_data_dir / "final-list_candidate-fusion-genes.txt") - fusions_list = FusionCatcher.load_records(path) - assert len(fusions_list) == 355 - - path = Path(fixture_data_dir / "final-list_candidate-fusion-genes.txts") - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert FusionCatcher.load_records(path) - - -def test_get_arriba_records(fixture_data_dir): - """Test that get_arriba_records works correctly""" - path = Path(fixture_data_dir / "fusions_arriba_test.tsv") - fusions_list = Arriba.load_records(path) - assert len(fusions_list) == 1 - - path = Path(fixture_data_dir / "fusionsd_arriba_test.tsv") - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert Arriba.load_records(path) - - -def test_get_cicero_records(fixture_data_dir): - """Test that get_cicero_records works correctly""" - path = Path(fixture_data_dir / "annotated.fusion.txt") - fusions_list = Cicero.load_records(path) - assert len(fusions_list) == 1 - - path = Path(fixture_data_dir / "annnotated.fusion.txt") - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert Cicero.load_records(path) - - -def test_get_enfusion_records(fixture_data_dir): - """Test that get_enfusion_records works correctly""" - path = Path(fixture_data_dir / "enfusion_test.csv") - fusions_list 
= EnFusion.load_records(path) - assert len(fusions_list) == 1 - - path = Path(fixture_data_dir / "enfusions_test.csv") - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert EnFusion.load_records(path) - - -def test_get_genie_records(fixture_data_dir): - """Test that get_genie_records works correctly""" - path = Path(fixture_data_dir / "genie_test.txt") - fusions_list = Genie.load_records(path) - assert len(fusions_list) == 1 - - path = Path(fixture_data_dir / "genie_tests.txt") - with pytest.raises(ValueError, match=f"{path} does not exist"): - assert Genie.load_records(path) From 270205b71f86da8b60c0e2137c83d146e708fc9a Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Mon, 27 Jan 2025 09:59:10 -0500 Subject: [PATCH 20/20] Change docstrings, remove double loop --- src/fusor/harvester.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/fusor/harvester.py b/src/fusor/harvester.py index a4e3006..20a42cb 100644 --- a/src/fusor/harvester.py +++ b/src/fusor/harvester.py @@ -30,7 +30,7 @@ def load_records( ) -> list[FusionCaller]: """Convert rows of fusion caller output to Pydantic classes - :param path: The path to the fusions file + :param fusion_path: The path to the fusions file :param column_rename: A dictionary of column mappings :param delimeter: The delimeter for the fusions file :raise ValueError: if the file does not exist at the specified path @@ -40,25 +40,23 @@ def load_records( statement = f"{fusion_path!s} does not exist" raise ValueError(statement) fusions_list = [] - fields_to_keep = self.fusion_caller.__annotations__.keys() + fields_to_keep = self.fusion_caller.__annotations__ with fusion_path.open() as csvfile: reader = csv.DictReader(csvfile, delimiter=self.delimeter) for row in reader: - row = { - self.column_rename.get(key, key): value - for key, value in row.items() - } - filered_row = { - key: value for key, value in row.items() if key in fields_to_keep - } - 
fusions_list.append(self.fusion_caller(**filered_row)) + filtered_row = {} + for key, value in row.items(): + renamed_key = self.column_rename.get(key, key) + if renamed_key in fields_to_keep: + filtered_row[renamed_key] = value + fusions_list.append(self.fusion_caller(**filtered_row)) return fusions_list class JAFFAHarvester(FusionCallerHarvester): """Class for harvesting JAFFA data""" - column_rename: ClassVar[dict] = { + column_rename: ClassVar[dict[str, str]] = { "fusion genes": "fusion_genes", "spanning reads": "spanning_reads", "spanning pairs": "spanning_pairs", @@ -70,7 +68,7 @@ class JAFFAHarvester(FusionCallerHarvester): class StarFusionHarvester(FusionCallerHarvester): """Class for harvesting STAR-Fusion data""" - column_rename: ClassVar[dict] = { + column_rename: ClassVar[dict[str, str]] = { "LeftGene": "left_gene", "RightGene": "right_gene", "LeftBreakpoint": "left_breakpoint", @@ -85,7 +83,7 @@ class StarFusionHarvester(FusionCallerHarvester): class FusionCatcherHarvester(FusionCallerHarvester): """Class for harvesting FusionCatcher data""" - column_rename: ClassVar[dict] = { + column_rename: ClassVar[dict[str, str]] = { "Gene_1_symbol(5end_fusion_partner)": "five_prime_partner", "Gene_2_symbol(3end_fusion_partner)": "three_prime_partner", "Fusion_point_for_gene_1(5end_fusion_partner)": "five_prime_fusion_point", @@ -102,7 +100,7 @@ class FusionCatcherHarvester(FusionCallerHarvester): class ArribaHarvester(FusionCallerHarvester): """Class for harvesting Arriba data""" - column_rename: ClassVar[dict] = { + column_rename: ClassVar[dict[str, str]] = { "#gene1": "gene1", "strand1(gene/fusion)": "strand1", "strand2(gene/fusion)": "strand2", @@ -116,7 +114,7 @@ class ArribaHarvester(FusionCallerHarvester): class CiceroHarvester(FusionCallerHarvester): """Class for harvesting Cicero data""" - column_rename: ClassVar[dict] = { + column_rename: ClassVar[dict[str, str]] = { "geneA": "gene_5prime", "geneB": "gene_3prime", "chrA": "chr_5prime", @@ -136,7 +134,7 
@@ class CiceroHarvester(FusionCallerHarvester): class EnFusionHarvester(FusionCallerHarvester): """Class for harvesting EnFusion data""" - column_rename: ClassVar[dict] = { + column_rename: ClassVar[dict[str, str]] = { "Gene1": "gene_5prime", "Gene2": "gene_3prime", "Chr1": "chr_5prime", @@ -152,7 +150,7 @@ class EnFusionHarvester(FusionCallerHarvester): class GenieHarvester(FusionCallerHarvester): """Class for harvesting Genie data""" - column_rename: ClassVar[dict] = { + column_rename: ClassVar[dict[str, str]] = { "Site1_Hugo_Symbol": "site1_hugo", "Site2_Hugo_Symbol": "site2_hugo", "Site1_Chromosome": "site1_chrom",