diff --git a/src/metakb/transformers/base.py b/src/metakb/transformers/base.py
index 163cf4a5..738de3ff 100644
--- a/src/metakb/transformers/base.py
+++ b/src/metakb/transformers/base.py
@@ -7,7 +7,7 @@
 from abc import ABC, abstractmethod
 from enum import Enum
 from pathlib import Path
-from typing import ClassVar
+from typing import ClassVar, TypeVar
 
 from disease.schemas import (
     SYSTEM_URI_TO_NAMESPACE as DISEASE_SYSTEM_URI_TO_NAMESPACE,
@@ -55,6 +55,8 @@
     NormalizedGene: "gene",
 }
 
+_CacheType = TypeVar("_CacheType", bound="_TransformedRecordsCache")
+
 
 def _sanitize_name(name: str) -> str:
     """Trim leading and trailing whitespace and replace whitespace characters with
@@ -289,7 +291,7 @@ def __init__(
         :param Optional[Path] harvester_path: Path to previously harvested data
         :param ViccNormalizers normalizers: normalizer collection instance
         """
-        self._cache: _TransformedRecordsCache
+        self._cache = self._create_cache()
         self.name = self.__class__.__name__.lower().split("transformer")[0]
         self.data_dir = data_dir / self.name
         self.harvester_path = harvester_path
@@ -308,6 +310,10 @@ async def transform(self, harvested_data: _HarvestedData) -> None:
         :param harvested_data: Source harvested data
         """
 
+    @abstractmethod
+    def _create_cache(self) -> _CacheType:
+        """Create cache for transformed records"""
+
     def extract_harvested_data(self) -> _HarvestedData:
         """Get harvested data from file.
 
diff --git a/src/metakb/transformers/civic.py b/src/metakb/transformers/civic.py
index f4d8bbe8..088cae0b 100644
--- a/src/metakb/transformers/civic.py
+++ b/src/metakb/transformers/civic.py
@@ -143,7 +143,7 @@ class _VariationCache(BaseModel):
     aliases: list[Extension] | None = None
     coordinates: dict | None = None
     members: list[Variation] | None = None
-    extensions: list[Extension] = []
+    extensions: list[Extension] | None = None
 
 
 class SourcePrefix(str, Enum):
@@ -192,7 +192,11 @@ def __init__(
         self.processed_data.methods = [
             self.methods_mapping[MethodId.CIVIC_EID_SOP.value]
         ]
-        self._cache = _CivicTransformedCache()
+        self._cache = self._create_cache()
+
+    def _create_cache(self) -> _CivicTransformedCache:
+        """Create cache for transformed records"""
+        return _CivicTransformedCache()
 
     @staticmethod
     def _mp_to_variant_mapping(molecular_profiles: list[dict]) -> tuple[list, dict]:
@@ -611,7 +615,7 @@ def _is_supported_variant_query(variant_name: str, variant_id: int) -> bool:
 
         return True
 
-    async def _get_variation_members(self, variant: dict) -> list[Variation] | None:
+    async def _get_variation_members(self, variant: dict) -> list[Variation]:
         """Get members field for variation object. This is the related variant concepts.
 
         :param variant: CIViC Variant record
@@ -701,7 +705,7 @@ async def _add_variations(self, variants: list[dict]) -> None:
                     label="_".join(vt["name"].lower().split()),
                 )
                 for vt in variant["variant_types"]
-                if vt and vt["url"]
+                if vt and vt["url"]  # system is required
             ]
 
             # Get mappings
diff --git a/src/metakb/transformers/moa.py b/src/metakb/transformers/moa.py
index 80df3c0e..f40b19e8 100644
--- a/src/metakb/transformers/moa.py
+++ b/src/metakb/transformers/moa.py
@@ -76,7 +76,11 @@ def __init__(
         self.processed_data.methods = [
             self.methods_mapping[MethodId.MOA_ASSERTION_BIORXIV.value]
         ]
-        self._cache = _MoaTransformedCache()
+        self._cache = self._create_cache()
+
+    def _create_cache(self) -> _MoaTransformedCache:
+        """Create cache for transformed records"""
+        return _MoaTransformedCache()
 
     async def transform(self, harvested_data: MoaHarvestedData) -> None:
         """Transform MOA harvested JSON to common data model. Will store transformed
@@ -223,10 +227,11 @@ async def _add_categorical_variants(self, variants: list[dict]) -> None:
             moa_variation = None
             gene = variant.get("gene") or variant.get("gene1")
             moa_gene = self._cache.genes[_sanitize_name(gene)]
+            protein_change = variant.get("protein_change")
 
             constraints = None
             extensions = []
-            if "rearrangement_type" in variant:
+            if "rearrangement_type" in variant or not protein_change:
                 logger.debug(
                     "Variation Normalizer does not support %s: %s",
                     moa_variant_id,
@@ -237,37 +242,24 @@ async def _add_categorical_variants(self, variants: list[dict]) -> None:
                 # Form query and run through variation-normalizer
                 # For now, the normalizer only support amino acid substitution
                 vrs_variation = None
-                if variant.get("protein_change"):
-                    gene = moa_gene.label
-                    query = f"{gene} {variant['protein_change'][2:]}"
-                    vrs_variation = await self.vicc_normalizers.normalize_variation(
-                        [query]
-                    )
-
-                    if not vrs_variation:
-                        logger.debug(
-                            "Variation Normalizer unable to normalize: moa.variant: %s using query: %s",
-                            variant_id,
-                            query,
-                        )
-                        extensions.append(self._get_vicc_normalizer_failure_ext())
-                    else:
-                        # Create VRS Variation object
-                        params = vrs_variation.model_dump(exclude_none=True)
-                        moa_variant_id = f"moa.variant:{variant_id}"
-                        params["id"] = vrs_variation.id
-                        moa_variation = Variation(**params)
-                        constraints = [
-                            DefiningAlleleConstraint(allele=moa_variation.root)
-                        ]
+                gene = moa_gene.label
+                query = f"{gene} {protein_change[2:]}"
+                vrs_variation = await self.vicc_normalizers.normalize_variation([query])
 
-                else:
+                if not vrs_variation:
                     logger.debug(
-                        "Variation Normalizer does not support %s: %s",
-                        moa_variant_id,
-                        feature,
+                        "Variation Normalizer unable to normalize: moa.variant: %s using query: %s",
+                        variant_id,
+                        query,
                     )
                     extensions.append(self._get_vicc_normalizer_failure_ext())
+                else:
+                    # Create VRS Variation object
+                    params = vrs_variation.model_dump(exclude_none=True)
+                    moa_variant_id = f"moa.variant:{variant_id}"
+                    params["id"] = vrs_variation.id
+                    moa_variation = Variation(**params)
+                    constraints = [DefiningAlleleConstraint(allele=moa_variation.root)]
 
             # Add MOA representative coordinate data to extensions
             coordinates_keys = [