Skip to content

Commit

Permalink
Hopefully solved an issue with protein mapping raising error when there
Browse files Browse the repository at this point in the history
ambigious amino acids
  • Loading branch information
yaccos committed Nov 20, 2020
1 parent be1c188 commit 5a069a2
Showing 1 changed file with 10 additions and 9 deletions.
19 changes: 10 additions & 9 deletions autopacmen/submodules/get_protein_mass_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import cobra
import requests
import time
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from typing import Dict, List
# Internal modules
from .helper_general import ensure_folder_existence, get_files, json_write, pickle_write, pickle_load, standardize_folder
Expand All @@ -33,7 +32,7 @@
def get_protein_mass_mapping(model: cobra.Model, project_folder: str, project_name: str) -> None:
"""Returns a JSON with a mapping of protein IDs as keys, and as values the protein mass in kDa.
The protein masses are calculated using the amino acid sequence from UniProt (retrieved using
The protein masses are taken from UniProt (retrieved using
UniProt's REST API).
Arguments
Expand Down Expand Up @@ -114,7 +113,7 @@ def get_protein_mass_mapping(model: cobra.Model, project_folder: str, project_na
# With 'OR', all given IDs are searched, and subsequently in this script,
# the right associated masses are being picked.
query = " OR ".join(batch)
uniprot_query_url = f"https://www.uniprot.org/uniprot/?query={query}&format=tab&columns=id,sequence"
uniprot_query_url = f"https://www.uniprot.org/uniprot/?query={query}&format=tab&columns=id,mass"
print(f"UniProt batch search for: {query}")

# Call UniProt's API :-)
Expand All @@ -127,18 +126,20 @@ def get_protein_mass_mapping(model: cobra.Model, project_folder: str, project_na
if line == "":
continue
uniprot_id = line.split("\t")[0]
sequence = line.split("\t")[1]
# Get the protein mass using biopython's associated function for amino acid sequences
mass_string = line.split("\t")[1]
try:
mass = ProteinAnalysis(sequence, monoisotopic=False).molecular_weight()
except ValueError: # e.g. if an "X" is in a sequence
# Note that the mass entry from UniProt uses a comma as a thousand separator, so it has to be removed before parsing
mass = float(mass_string.replace(",",""))
except ValueError: # We may also risk the entry is missing
# print(f"No protein mass obtainable for protein ID {uniprot_id}")
continue
uniprot_id_protein_mass_mapping[uniprot_id] = float(mass)

# Create the pickled cache files for the searched protein masses
for uniprot_id in batch:
cache_filepath = cache_basepath + uniprot_id
pickle_write(cache_filepath, uniprot_id_protein_mass_mapping[uniprot_id])
if uniprot_id in uniprot_id_protein_mass_mapping: # Takes into account that we may fail to obtain a UniProt ID
cache_filepath = cache_basepath + uniprot_id
pickle_write(cache_filepath, uniprot_id_protein_mass_mapping[uniprot_id])

# Continue with the next batch :D
batch_start += batch_size
Expand Down

0 comments on commit 5a069a2

Please sign in to comment.