# Reconstructed from a whitespace-collapsed patch: these lines hold the
# function added to cgsmiles/cgsmiles_utils.py and the complete new module
# cgsmiles/dialects.py.
from functools import partial
from inspect import signature, Signature, Parameter


##########################################################
#               cgsmiles/cgsmiles_utils.py               #
##########################################################
def read_fragment_cgsmiles(cgsmiles_str,
                           fragname,
                           bonding_descrpt=None,
                           attributes=None):
    """
    Read a CGSmiles fragment string and annotate the resulting graph
    with bonding descriptors and any other node attributes.

    The following node attributes are set, in this order:

      - atomname to the per-node fragname read from `cgsmiles_str`
      - bonding to the entries of `bonding_descrpt`
      - fragname to `fragname` (replacing the per-node names)
      - fragid to 0
      - w to 1
      - everything given in `attributes`

    Parameters
    ----------
    cgsmiles_str: str
        string in CGSmiles format
    fragname: str
        the name of the fragment
    bonding_descrpt: dict
        maps node keys to their bonding descriptors; nothing is
        annotated if omitted
    attributes: dict
        maps node keys to a dict of attributes that is applied last
        and therefore takes precedence; nothing is applied if omitted

    Returns
    -------
    nx.Graph
        the graph of the molecular fragment
    """
    # imported here so the function does not depend on where in the
    # module it is placed
    import networkx as nx
    from .read_cgsmiles import read_cgsmiles

    # never share a mutable default between calls
    bonding_descrpt = {} if bonding_descrpt is None else bonding_descrpt
    attributes = {} if attributes is None else attributes

    mol_graph = read_cgsmiles(cgsmiles_str)
    # the names parsed from the string become the atomnames before the
    # fragname attribute is overwritten with the name of the fragment
    fragnames = nx.get_node_attributes(mol_graph, 'fragname')
    nx.set_node_attributes(mol_graph, fragnames, 'atomname')
    nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
    nx.set_node_attributes(mol_graph, fragname, 'fragname')
    nx.set_node_attributes(mol_graph, 0, 'fragid')
    # NOTE(review): the default is stored under 'w' while the fragment
    # dialect below reports 'weight' -- confirm that this is intended
    nx.set_node_attributes(mol_graph, 1, 'w')
    nx.set_node_attributes(mol_graph, attributes)
    return mol_graph


##########################################################
#                  cgsmiles/dialects.py                  #
##########################################################
def check_and_cast_types(bound_args, signature):
    """
    Cast the bound arguments to the types annotated in the signature.

    Arguments without annotation and arguments that already have the
    annotated type are left untouched. All other values are converted
    by calling the annotation on the value. The conversion is done
    in place.

    Parameters
    ----------
    bound_args: inspect.BoundArguments
        the arguments to check
    signature: inspect.Signature
        the signature providing the annotations

    Returns
    -------
    inspect.BoundArguments
        the same `bound_args` object with converted values

    Raises
    ------
    TypeError
        if a value cannot be converted to the annotated type
    """
    for name, value in bound_args.arguments.items():
        param = signature.parameters.get(name)
        # skip parameters that are unknown or carry no annotation
        if param is None or param.annotation is Parameter.empty:
            continue
        expected_type = param.annotation
        if isinstance(value, expected_type):
            continue
        # only existing keys are re-assigned, so changing the dict
        # while looping over it is safe
        try:
            bound_args.arguments[name] = expected_type(value)
        except (TypeError, ValueError) as err:
            msg = f"Argument '{name}' must be of type {expected_type.__name__}"
            raise TypeError(msg) from err
    return bound_args


def _parse_dialect_string(string_iterable,
                          dialect_signature,
                          arg_to_fullname=None,
                          annotation_sep_token=';',
                          annotation_assign_token='='):
    """
    This base function parses a string that describes key value pairs
    having a pattern of:

    key<assign_token>value<sep_token>key<assign_token>value ...

    Default values, non-keyword arguments and types are defined using
    the dialect signature object. If args are defined the key and
    assignment token may be omitted.

    Neither the `annotation_sep_token` nor the `annotation_assign_token`
    can be part of key or value. A SyntaxError is raised in this case.

    Parameters
    ----------
    string_iterable: str
        the string that contains the annotations; it is split using
        `annotation_sep_token`
    dialect_signature: inspect.Signature
        a signature defining args, kwargs, default values
        and types
    arg_to_fullname: dict
        maps arguments to more verbose descriptions; no argument is
        renamed if omitted
    annotation_sep_token: str
        character used to separate key value pairs
    annotation_assign_token: str
        character used to assign a value to a key

    Returns
    -------
    dict
        dict of key value pairs

    Raises
    ------
    SyntaxError
        an error is raised if the signature does not match or
        too many annotation_assign_token are given
    TypeError
        if a value cannot be converted to the type defined by
        the signature
    """
    arg_to_fullname = {} if arg_to_fullname is None else arg_to_fullname
    args_found = []
    kwargs_found = {}
    if string_iterable:
        for entry in string_iterable.split(annotation_sep_token):
            # the configured token has to be counted and not a
            # hard-coded '=' otherwise custom dialects are not checked
            if entry.count(annotation_assign_token) > 1:
                msg = (f"Your annotation {entry} contains too many "
                       f"{annotation_assign_token} characters. Only one "
                       "character per key value pair is allowed.")
                raise SyntaxError(msg)
            key, assign_found, value = entry.partition(annotation_assign_token)
            if assign_found:
                kwargs_found[key] = value
            else:
                args_found.append(key)

    try:
        applied_labels = dialect_signature.bind(*args_found,
                                                **kwargs_found)
    except TypeError as emsg:
        msg = ("You have too many positional arguments or "
               f"{annotation_sep_token} as part of key value "
               "pairs which is not allowed.")
        # keep the original error as cause instead of printing it
        raise SyntaxError(msg) from emsg

    # values are cast before the defaults are applied so the defaults
    # keep the exact value given in the signature
    applied_labels = check_and_cast_types(applied_labels,
                                          dialect_signature)
    applied_labels.apply_defaults()
    arguments = applied_labels.arguments

    # convert keys to more verbose names
    # this should only apply to args known to
    # the signature
    for old_key, new_key in arg_to_fullname.items():
        if old_key in arguments:
            arguments[new_key] = arguments.pop(old_key)

    # if there are kwargs we need to put them into
    # the output dict; named arguments are added last
    # and take precedence
    out_args = {}
    if 'kwargs' in arguments:
        out_args.update(arguments.pop('kwargs'))
    out_args.update(arguments)
    return out_args


def create_dialect(default_attributes, accept_kwargs=True):
    """
    Creates a signature of default annotations.
    Note that the order of the entries in the dict
    determines the order of the args accepted.

    Parameters
    ----------
    default_attributes: dict
        maps argument names to their default value; the type
        of the default value becomes the annotation of the
        argument
    accept_kwargs: bool
        if True the signature also accepts arbitrary keyword
        arguments, which are collected under the name `kwargs`

    Returns
    -------
    inspect.Signature
    """
    parameters = [Parameter(argname,
                            Parameter.POSITIONAL_OR_KEYWORD,
                            default=default_value,
                            annotation=type(default_value))
                  for argname, default_value in default_attributes.items()]
    if accept_kwargs:
        parameters.append(Parameter('kwargs',
                                    kind=Parameter.VAR_KEYWORD))
    return Signature(parameters)


##########################################################
#                   KNOWN DIALECTS                       #
##########################################################
# this one is for global use
# it is the base CGSmiles dialect
CGSMILES_DEFAULT_DIALECT = create_dialect({"fragname": "NaN",
                                           "q": 0.0,
                                           "w": 1.0})
parse_graph_base_node = partial(_parse_dialect_string,
                                dialect_signature=CGSMILES_DEFAULT_DIALECT,
                                arg_to_fullname={"w": "weight", "q": "charge"})
# this one is an internal workaround until the pysmiles
# base parser is available
# it just strips the kwargs from fragments before
# they go to the respective parser
# in case of cgsmiles fragments it is doing
# double the work
fragment_base = create_dialect({"w": 1.0}, accept_kwargs=True)
_fragment_node_parser = partial(_parse_dialect_string,
                                dialect_signature=fragment_base,
                                arg_to_fullname={"w": "weight"})
copy_attrs=['fragid', 'fragname', 'weight']): """ Helper function which add hydrogen atoms to the molecule graph. @@ -52,17 +54,22 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): by the number of bonding descriptors. In this way hydrogen atoms can also be added to fragments only. + The `copy_attrs` argument defines a list of attributes to copy + to the newly added hydrogen atoms. In case the hydrogen atoms + are their own fragments attributes are not copied. If an attribute + is already assigned, because the hydrogen atom was explicit that + attribute is not replaced. + Parameters ---------- mol_graph: :class:`nx.Graph` graph describing the full molecule without hydrogen atoms + copy_attrs: list[abc.hashable] + a list of attributes to copy from the parent node to the + hydrogen atom + keep_bonding: bool + adjust hcount for number of bonding descriptors """ - for node in mol_graph.nodes: - - if mol_graph.nodes[node].get('bonding', False) and \ - mol_graph.nodes[node].get('element', '*') == "H": - mol_graph.nodes[node]['single_h_frag'] = True - try: pysmiles.smiles_helper.correct_aromatic_rings(mol_graph, strict=True) except SyntaxError as pysmiles_err: @@ -89,14 +96,14 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): # now we add the hydrogen atoms pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) - # if we are having single hydrogen fragments we need to - # make sure the fragid and fragname is keept - for node in mol_graph.nodes: - if mol_graph.nodes[node].get("element", "*") == "H" and\ - not mol_graph.nodes[node].get("single_h_frag", False): - ref_node = next(mol_graph.neighbors(node)) - mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"] - mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"] + for node, element in mol_graph.nodes(data='element'): + if element == "H" and not mol_graph.nodes[node].get("single_h_frag", False): + anchor = next(mol_graph.neighbors(node)) + for attr in copy_attrs: + if attr in 
def read_fragment_smiles(smiles_str,
                         fragname,
                         bonding_descrpt=None,
                         rs_isomers=None,
                         ez_isomers=None,
                         attributes=None):
    """
    Read a smiles_str corresponding to a CGSmiles fragment and
    annotate bonding descriptors, isomers, as well as any other
    attributes.

    This function also sets default attributes as follows:

      - fragname to `fragname`
      - fragid to 0
      - weight to 1
      - atomname to the element followed by the node key

    Fragments that consist of a single atom are returned without
    isomer annotation. A single hydrogen atom is tagged with
    `single_h_frag`; for any other single atom the hcount is set to 0.

    Parameters
    ----------
    smiles_str: str
        string in OpenSMILES format
    fragname: str
        the name of the fragment
    bonding_descrpt: dict
        maps node keys to their bonding descriptors
    rs_isomers: dict
        maps node keys to the value stored as `rs_isomer`
    ez_isomers: dict
        maps node keys to a sequence; the last element is stored as
        `ez_isomer_class` and all elements before it are stored as
        `ez_isomer_atoms`
    attributes: dict
        maps node keys to a dict of attributes; hydrogen atoms that
        have an entry are kept as explicit nodes

    Returns
    -------
    nx.Graph
        the graph of the molecular fragment
    """
    # never share a mutable default between calls
    bonding_descrpt = {} if bonding_descrpt is None else bonding_descrpt
    rs_isomers = {} if rs_isomers is None else rs_isomers
    ez_isomers = {} if ez_isomers is None else ez_isomers
    attributes = {} if attributes is None else attributes

    if smiles_str == 'H':
        # NOTE(review): LOGGER has to be defined at module level; the
        # definition is not part of the lines shown here -- confirm
        LOGGER.warning("You define an H fragment, which is not valid SMILES. We'll make it [H].")
        smiles_str = '[H]'

    mol_graph = pysmiles.read_smiles(smiles_str,
                                     explicit_hydrogen=True,
                                     reinterpret_aromatic=False,
                                     strict=False)
    # set some default values
    nx.set_node_attributes(mol_graph, fragname, 'fragname')
    nx.set_node_attributes(mol_graph, 0, 'fragid')
    nx.set_node_attributes(mol_graph, 1, 'weight')

    # we add all bonding descriptors to the molecule
    nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')

    # set other attributes; they replace the defaults set above
    nx.set_node_attributes(mol_graph, attributes)

    # set the default atomnames consisting of the element and index
    atomnames = {node: data['element'] + str(node)
                 for node, data in mol_graph.nodes(data=True)}
    nx.set_node_attributes(mol_graph, atomnames, 'atomname')

    # we have just a single atom so no need for any annotations
    if len(mol_graph) == 1:
        # we set the hcount for all non-hydrogen elements
        if mol_graph.nodes[0]['element'] != 'H':
            mol_graph.nodes[0]['hcount'] = 0
        # we tag all single h-atoms
        else:
            mol_graph.nodes[0]['single_h_frag'] = True
        return mol_graph

    # we need to remove hydrogen atoms except when they are having
    # attributes; in this case we need to keep them
    hatoms = {node for node, element in mol_graph.nodes(data='element')
              if element == 'H'}
    hatoms_to_keep = hatoms.intersection(attributes)

    # temp fix until pysmiles util is improved
    # we set the element to z so they are ignored when pysmiles removes hatoms
    nx.set_node_attributes(mol_graph,
                           dict.fromkeys(hatoms_to_keep, 'z'),
                           'element')

    pysmiles.remove_explicit_hydrogens(mol_graph)

    # now we reset the hatoms
    nx.set_node_attributes(mol_graph,
                           dict.fromkeys(hatoms_to_keep, 'H'),
                           'element')

    # annotate rs isomers
    nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer')

    # we need to split countable node keys and the associated value
    ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()}
    ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()}
    nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms')
    nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class')

    return mol_graph
for charge - charge = 0.0 - for sign in ["+", "-"]: - if sign in fragname: - fragname, charge = fragname.split(sign) - if len(charge) == 0: - charge = float(sign+"1") - else: - charge = float(sign+charge) + # read the annotations + attributes = parse_graph_base_node(fragname) # if this residue is part of a branch we store it in # the recipe dict together with the anchor residue # and expansion number if branching: - recipes[branch_anchor[-1]].append((fragname, n_mon, prev_bond_order)) - + recipes[branch_anchor[-1]].append((n_mon, attributes, prev_bond_order)) # new we add new residue as often as required connection = [] for _ in range(0, n_mon): - mol_graph.add_node(current, fragname=fragname, charge=charge) + mol_graph.add_node(current, **attributes) if prev_node is not None: mol_graph.add_edge(prev_node, current, order=prev_bond_order) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 7fb49a1..3d005e8 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -1,10 +1,17 @@ """ Functions for reading the fragment list. 
""" +import logging from collections import defaultdict import networkx as nx import pysmiles from .read_cgsmiles import read_cgsmiles +from .dialects import _fragment_node_parser +from .pysmiles_utils import read_fragment_smiles +from .cgsmiles_utils import read_fragment_cgsmiles + +logger = logging.getLogger('pysmiles') +logger.setLevel(level=logging.ERROR) class PeekIter(object): """ @@ -122,6 +129,8 @@ def strip_bonding_descriptors(fragment_string): rings = defaultdict(list) ez_isomer_atoms = {} rs_isomers = {} + attributes = defaultdict(dict) + record_attributes = False smile = "" node_count = 0 prev_node = 0 @@ -148,6 +157,7 @@ def strip_bonding_descriptors(fragment_string): bonding_descrpt[prev_node].append(bond_descrp + str(order)) else: atom = token + attribute_str = "" while peek != ']': # deal with rs chirality if peek == '@': @@ -155,9 +165,20 @@ def strip_bonding_descriptors(fragment_string): if smile_iter.peek() == '@': chiral_token = '@' + next(smile_iter) rs_isomers[node_count] = (chiral_token, []) + # we have weights + elif peek == ';' and not record_attributes: + record_attributes = True + elif record_attributes: + attribute_str += peek else: atom += peek peek = next(smile_iter) + + record_attributes=False + # here we do some post processing cleanup + node_attributes = _fragment_node_parser(attribute_str) + attributes[node_count].update(node_attributes) + smile = smile + atom + "]" prev_node = node_count node_count += 1 @@ -202,7 +223,8 @@ def strip_bonding_descriptors(fragment_string): if node in ring_nodes: bonded_node = _find_bonded_ring_node(ring_nodes, node) rs_isomers[node][1].append(bonded_node) - return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms + + return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, attributes def fragment_iter(fragment_str, all_atom=True): """ @@ -231,34 +253,22 @@ def fragment_iter(fragment_str, all_atom=True): for fragment in fragment_str[1:-1].split(','): delim = fragment.find('=', 0) fragname = 
fragment[1:delim] - big_smile = fragment[delim+1:] - smile, bonding_descrpt, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile) - if smile == "H": - mol_graph = nx.Graph() - mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) - nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') - elif all_atom: - mol_graph = pysmiles.read_smiles(smile, reinterpret_aromatic=False, strict=False) - nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') - nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer') - # we need to split countable node keys and the associated value - ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()} - ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()} - nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms') - nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class') + frag_smile = fragment[delim+1:] + smiles_str, bonding_descrpt, rs_isomers, ez_isomers, attributes = strip_bonding_descriptors(frag_smile) + # read an all_atom fragment using OpenSMILES definition + if all_atom: + mol_graph = read_fragment_smiles(smiles_str, + fragname, + bonding_descrpt, + rs_isomers, + ez_isomers, + attributes) # we deal with a CG resolution graph else: - mol_graph = read_cgsmiles(smile) - fragnames = nx.get_node_attributes(mol_graph, 'fragname') - nx.set_node_attributes(mol_graph, fragnames, 'atomname') - nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') - - if all_atom: - atomnames = {node[0]: node[1]['element']+str(node[0]) for node in mol_graph.nodes(data=True)} - nx.set_node_attributes(mol_graph, atomnames, 'atomname') - - nx.set_node_attributes(mol_graph, fragname, 'fragname') - nx.set_node_attributes(mol_graph, 0, 'fragid') + mol_graph = read_fragment_cgsmiles(smiles_str, + fragname, + bonding_descrpt, + attributes) yield fragname, mol_graph def read_fragments(fragment_str, all_atom=True, fragment_dict=None): diff --git 
a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 9a59a7b..36a9878 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -11,7 +11,13 @@ [(0, 1), (1, 2)], [1, 1]), # smiple charges - ("{[#PMA+][#PEO][#PMA-0.25]}", + ("{[#PMA;+1][#PEO][#PMA;-0.25]}", + ["PMA", "PEO", "PMA"], + {0: 1.0, 1: 0.0, 2:-0.25}, + [(0, 1), (1, 2)], + [1, 1]), + # smiple charges with keyword + ("{[#PMA;q=+1][#PEO][#PMA;q=-0.25]}", ["PMA", "PEO", "PMA"], {0: 1.0, 1: 0.0, 2:-0.25}, [(0, 1), (1, 2)], @@ -256,184 +262,258 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): assert meta_mol.edges[edge]["order"] == order fragnames = nx.get_node_attributes(meta_mol, 'fragname') + print(fragnames) + print(nodes) assert nodes == list(fragnames.values()) if charges: set_charges = nx.get_node_attributes(meta_mol, 'charge') assert set_charges == charges -@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez',( +@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, attrs',( # smiple symmetric bonding ("[$]COC[$]", "COC", {0: ["$1"], 2: ["$1"]}, None, + None, None), + # smiple symmetric bonding with weight + ("[$]C[O;0.5]C[$]", + "C[O]C", + {0: ["$1"], 2: ["$1"]}, + None, + None, + {'weight': {1: 0.5}}), + # smiple kwarg not part of the defaults + ("[$]C[O;q=4;p=s][C;q=3;p=l][$]", + "C[O][C]", + {0: ["$1"], 2: ["$1"]}, + None, + None, + {'q': {1: '4', 2: '3'}, 'p': {1: 's', 2: 'l'}}), + # smiple symmetric bonding with weight + # using cgsmiles string + ("[$][#TC4][#OT1;0.5][#CD1][$]", + "[#TC4][#OT1][#CD1]", + {0: ["$1"], 2: ["$1"]}, + None, + None, + {'weight': {1: 0.5}}), + # smiple symmetric bonding with random + # keyword argument + ("[$][#TC4][#OT1;r=abc][#CD1][$]", + "[#TC4][#OT1][#CD1]", + {0: ["$1"], 2: ["$1"]}, + None, + None, + {'r': {1: 'abc'}}), + # smiple symmetric bonding with weight + # using open smiles and hweights + ("[$]CO[C;0.5][$]([H;0.1])[H;0.2]", + "CO[C]([H])[H]", + {0: 
["$1"], 2: ["$1"]}, + None, + None, + {'weight': {2: 0.5, 3: 0.1, 4: 0.2}}), + # H atom with weight goes first + ("[H;0.3]C[$]O[C;0.5][$]", + "[H]CO[C]", + {1: ["$1"], 3: ["$1"]}, + None, + None, + {'weight': {0: 0.3, 3: 0.5}}), # smiple symmetric bonding after branch ("[$]CC(CC)[$]", "CC(CC)", {0: ["$1"], 1: ["$1"]}, None, + None, None), # smiple symmetric bonding after ring ("[$]CC1[$]CCC1", "CC1CCC1", {0: ["$1"], 1: ["$1"]}, None, + None, None), # clear order symbol ("[CH][$a]=[CH][$c]", "[CH]=[CH]", {0: ["$a1"], 1: ["$c1"]}, None, + None, None), # multiple non-one bonding l ("CC=[$a]=[$b]CC", "CCCC", {1: ["$a2", "$b2"]}, None, + None, None), # multiple non-one bonding l ("CC[$a]=[$b]CC", "CCCC", {1: ["$a1", "$b2"]}, None, + None, None), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", {0: ["$1A1"], 2: ["$1A1"]}, None, + None, None), # smiple bonding multiletter atom ("Clc[$]c[$]", "Clcc", {1: ["$1"], 2: ["$1"]}, None, + None, None), # simple symmetric but with explicit hydrogen ("[$][CH2]O[CH2][$]", "[CH2]O[CH2]", {0: ["$1"], 2: ["$1"]}, None, + None, None), # smiple symmetric bonding; multiple descript ("[$]COC[$][$1]", "COC", {0: ["$1"], 2: ["$1", "$11"]}, None, + None, None), # named different bonding descriptors ("[$1]CCCC[$2]", "CCCC", {0: ["$11"], 3: ["$21"]}, None, + None, None), # ring and bonding descriptors ("[$1]CC[$2]C1CCCCC1", "CCC1CCCCC1", {0: ["$11"], 1: ["$21"]}, None, + None, None), # bonding descript. 
after branch ("C(COC[$1])[$2]CCC[$3]", "C(COC)CCC", {0: ["$21"], 3: ["$11"], 6: ["$31"]}, None, + None, None), # left rigth bonding desciptors ("[>]COC[<]", "COC", {0: [">1"], 2: ["<1"]}, None, + None, None), # simple chirality in residue ("[>]C[C@](F)(B)N[<]", "C[C](F)(B)N", {0: [">1"], 4: ["<1"]}, {1: ('@', [])}, + None, None), # simple chirality inverse in residue ("[>]C[C@@](F)(B)N[<]", "C[C](F)(B)N", {0: [">1"], 4: ["<1"]}, {1: ('@@', [])}, + None, None), # \ fragment split ("[>]CC(\F)=[<]", "CC(F)", {0: [">1"], 1: ["<2"]}, None, - {2: (2, 1, '\\')}), + {2: (2, 1, '\\')}, + None), # / fragment split ("[>]CC(/F)=[<]", "CC(F)", {0: [">1"], 1: ["<2"]}, None, - {2: (2, 1, '/')}), + {2: (2, 1, '/')}, + None), # both in one fragment ("[>]CC(/F)=C(\F)C[<]", "CC(F)=C(F)C", {0: [">1"], 5: ["<1"]}, None, - {2: (2, 1, '/'), 4: (4, 3, '\\')}), + {2: (2, 1, '/'), 4: (4, 3, '\\')}, + None), )) -def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez): - new_smile, new_bonding, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile) +def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs): + new_smile, new_bonding, rs_isomers, ez_isomers, attrs_out = strip_bonding_descriptors(big_smile) assert new_smile == smile assert new_bonding == bonding if rs: assert rs == rs_isomers if ez: assert ez == ez_isomers + # here we check that the weights are correctly + # set for nodes with weights; the default is + # checked in another test + print(attrs_out) + if attrs: + for attr, node_attrs in attrs.items(): + for node, value in node_attrs.items(): + assert attrs_out[node][attr] == value @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment ("{#PEO=[$]COC[$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), + 
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), )}, {"PEO": [(0, 1), (1, 2)]}), # single fragment but with explicit hydrogen in smiles ("{#PEO=[$][CH2]O[CH2][$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), )}, {"PEO": [(0, 1), (1, 2)]}), # test NH3 terminal ("{#AMM=N[$]}", - {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3}), + {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "weight": 1}), )}, {"AMM": []}), # single fragment + 1 terminal (i.e. 
only 1 bonding descrpt ("{#PEO=[$]COC[$],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "weight": 1}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # this adjust the hydrogen count ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "weight": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. 
# but explicit hydrogen in the smiles string ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "weight": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}), )}, {"PEO": [(0, 1), (1, 2),], "OHter": []}), diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index 096c57d..ebc8299 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -40,9 +40,9 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): assert new_btypes == btypes -@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez',( +@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez, weights',( # smiple linear seqeunce - ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}", + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$]O}", # 0 1 2 3 4 5 6 7 8 [('OHter', 'O H'), ('PEO', 'C O C H H H H'), # 9 10 11 12 13 14 15 16 17 @@ -51,7 +51,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)], - {}, {}), + {}, {}, {}), # smiple 
linear seqeunce with bond-order in link ("{[#TC1][#TC4][#TC1]}.{#TC1=[$1]=CC=[$2],#TC4=[$1]=CC=[$2]}", # 0 1 2 3 4 5 6 7 8 9 @@ -61,9 +61,9 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'C C H H H H C C H H C C H H H H', [(0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (0, 6), (6, 7), (6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13), - (10, 14), (11, 15)], {}, {}), + (10, 14), (11, 15)], {}, {}, {}), # smiple linear seqeunce unconsumed bonding descrpt - ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}", + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$]O}", # 0 1 2 3 4 5 6 7 8 [('OHter', 'O H'), ('PEO', 'C O C H H H H'), # 9 10 11 12 13 14 15 16 17 @@ -71,7 +71,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}), + (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}, {}), # smiple linear seqeunce with ionic bond ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O-].[Na+]}", # 0 1 2 3 4 5 6 7 8 @@ -81,7 +81,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O Na C O C H H H H C O C H H H H O Na', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}), + (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}, {}), # smiple linear seqeunce with ionic ending ("{[#OH][#PEO]|2[#ON]}.{#PEO=[$]COC[$],#OH=[$]O,#ON=[$][O-]}", # 0 1 2 3 4 5 6 7 8 @@ -91,7 +91,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16)], {}, {}), + (11, 14), (11, 15), (11, 16)], {}, 
{}, {}), # uncomsumed bonding IDs; note that this is not the same # molecule as previous test case. Here one of the OH branches # and replaces an CH2 group with CH-OH @@ -103,7 +103,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (2, 5), (2, 11), (3, 4), (4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13), - (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}), + (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}, {}), # simple branched sequence ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), @@ -111,11 +111,11 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'H C C H H H C O C H H H H H C C H H H C O C H H H H H H', [(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8), (8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27), - (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}), + (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}, {}), # something with a ring # 012 34567 # 890123456 - ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$]H}", + ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$][H]}", [('Hter', 'H'), ('PS', 'C C C C C C C C H H H H H H H H'), ('PS', 'C C C C C C C C H H H H H H H H'), ('Hter', 'H')], 'H C C C C C C C C H H H H H H H H C C C C C C C C H H H H H H H H H', @@ -124,7 +124,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25), (17, 26), (18, 19), (18, 27), (18, 33), (19, 20), (19, 24), (20, 21), (20, 28), (21, 22), (21, 29), (22, 23), (22, 30), - (23, 24), (23, 31), (24, 32)], {}, {}), + (23, 24), (23, 31), (24, 32)], {}, {}, {}), # something more 
complicated branched # here we have multiple bonding descriptors # # despite being the same residue we have 3 fragments after adding hydrgens @@ -146,7 +146,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H H'), ('B', 'C H H C H H H'),], 'O H C H H C H H H', [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), - (5, 6), (5, 7), (5, 8)], {}, {}), + (5, 6), (5, 7), (5, 8)], {}, {}, {}), # smiple squash operator; unconsumed operators ("{[#A][#B]}.{#A=OC[!],#B=[$][!]CC}", # 0 1 2 3 4 1 5 3 4 6 7 8 @@ -157,7 +157,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H H'), ('B', 'C H H C H H H'),], 'O H C H H C H H H', [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), - (5, 6), (5, 7), (5, 8)], {}, {}), + (5, 6), (5, 7), (5, 8)], {}, {}, {}), # smiple squash operator; plus connect operator ("{[#A][#B][#C]}.{#A=OC[!],#B=[$][!]CC,#C=[$]O}", # 0 1 2 3 4 1 5 3 4 6 7 8 @@ -168,7 +168,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H'), ('B', 'C H C H H H'), ('C', 'O H')], 'O H C H C H H H O H', [(0, 1), (0, 2), (2, 3), (2, 4), - (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}), + (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}, {}), # THF like test case with double edge and squash operator ("{[#A]=[#B]}.{#A=[!]COC[!],#B=[!]CCCC[!]}", [('A', 'O C C H H H H'), @@ -176,7 +176,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O C C H H H H C C H H H H', [(0, 2), (0, 3), (2, 4), (2, 5), (3, 6), (3, 7), (2, 8), (3, 9), - (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}), + (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}, {}), # Toluene like test case with squash operator and aromaticity ("{[#SC3]1[#TC5][#TC5]1}.{#SC3=Cc(c[!])c[!],#TC5=[!]ccc[!]}", [('SC3', 'C C H H H C H C H'), @@ -184,7 +184,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'C C H H H C H C H C H C H C H', [(0, 1), (0, 2), 
(0, 3), (0, 4), (1, 5), (1, 7), (5, 9), (5, 6), (7, 13), (7, 8), - (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}), + (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}, {}), # simple chirality assigment with rings ("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}", # 0 1 2 3 @@ -194,7 +194,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18), (7, 19), (8, 20), (9, 21), (10, 22), (11, 23)], {1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4), - 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}), + 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}, {}), # simple chirality assigment between fragments ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@H][>]C(=O)OC}", # 0 1 2 3 @@ -204,7 +204,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), (10, 13), (5, 14), (14, 15)], - {3: (2, 10, 4, 14)}, {}), + {3: (2, 10, 4, 14)}, {}, {}), # simple chirality assigment between fragments inv ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@@H][>]C(=O)OC}", # 0 1 2 3 @@ -214,14 +214,14 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), (10, 13), (5, 14), (14, 15)], - {3: (2, 10, 14, 4)}, {}), + {3: (2, 10, 14, 4)}, {}, {}), # smiple ez isomerism assigment between fragments inv ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(\F)C}", [('A', 'C C F H H H'), ('B', 'C F C H H H')], 'C C F H H H F C C H H H', [(0, 1), (1, 2), (0, 3), (0, 4), (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], - {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}), + {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}, {}), # have more than one e/z pair ("{[#A][#B][#B][#C]}.{#A=CC(/F)=[>],#B=[<]=C(\F)C=[>],#C=[<]=C(\F)C}", 
[('A', 'C C F H H H'), ('B', 'C F C H'), @@ -231,14 +231,14 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (8, 10), (8, 9), (10, 11), (10, 12), (12, 14), (12, 13), (14, 15), (14, 16), (16, 17), (16, 18), (16, 19)], {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans'), - 11: (11, 10, 14, 15, 'cis'), 15: (15, 14, 10, 11, 'cis')}), + 11: (11, 10, 14, 15, 'cis'), 15: (15, 14, 10, 11, 'cis')}, {}), # simple ez isomerism assigment between fragments inv ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(/F)C}", [('A', 'C C F H H H'), ('B', 'C F C H H H')], 'C C F H H H F C C H H H', [(0, 1), (1, 2), (0, 3), (0, 4), (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], - {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}), + {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}, {}), # test skip virtual nodes ("{[#SP4]1.2[#SP4].3[#SP1r]1.[#TC4]23}.{#SP4=OC[$]C[$]O,#SP1r=[$]OC[$]CO}", [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), @@ -248,9 +248,40 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), (18, 21), (18, 22), (19, 23)], - {},{}), + {},{}, {}), + # test weights + ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5]C[$]C[$]O,#SP1r=[$]OC[$]CO}", + [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), + ('SP1r', 'O C C O H H H H')], + 'O C C O H H H H O C C O H H H H O C C O H H H H', + [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6), + (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), + (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), + (18, 21), (18, 22), (19, 23)], + {},{}, {0: 0.5, 1: 1, 2: 1, 3: 1, 4: 0.5, 5: 1, 6: 1, 7: 1, 8: 0.5, + 9: 1, 10: 1, 11: 1, 12: 0.5, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, + 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1}), + # test 2 weights + ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5][C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}", + 
[('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), + ('SP1r', 'O C C O H H H H')], + 'O C C O H H H H O C C O H H H H O C C O H H H H', + [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6), + (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), + (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), + (18, 21), (18, 22), (19, 23)], + {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.5, 8: 0.5, 9: 0.1, 12: 0.5, 13: 0.1}), + # test 2 weights and hydrogen weights + ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[O;0.5]([H;0.2])[C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}", + [('SP4', 'O H C C O H H H'), ('SP4', 'O H C C O H H H'), + ('SP1r', 'O C C O H H H H')], + 'O H C C O H H H O H C C O H H H O C C O H H H H', + [(0, 1), (0, 2), (2, 3), (2, 10), (2, 5), (3, 4), (3, 16), (3, 6), (4, 7), (8, 9), + (8, 10), (10, 11), (10, 13), (11, 12), (11, 17), (11, 14), (12, 15), (16, 17), + (17, 18), (17, 20), (18, 19), (18, 21), (18, 22), (19, 23)], + {},{}, {0: 0.5, 1: 0.2, 2: 0.1, 5: 0.1, 8: 0.5, 9: 0.2, 10: 0.1, 13: 0.1}), )) -def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez): +def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez, weights): meta_mol, molecule = MoleculeResolver.from_string(smile).resolve() # loop and compare fragments first @@ -284,6 +315,12 @@ def _ele_match(n1, n2): if ez: ez_assigned = nx.get_node_attributes(molecule, 'ez_isomer') assert ez == ez_assigned + # check weights + if weights: + mol_weights = {node: 1 for node in ref_graph} + mol_weights.update(weights) + weights_assigned = nx.get_node_attributes(molecule, 'weight') + assert mol_weights == weights_assigned @pytest.mark.parametrize('case, cgsmiles_str, ref_string',( # case 1: here only the meta-graph is described by the @@ -312,13 +349,16 @@ def _atomname_match(n1, n2): return n1["fragname"] == n2["atomname"] assert nx.is_isomorphic(ref_graph, molecule, node_match=_atomname_match) -@pytest.mark.parametrize('cgsmiles_str, 
error_message',( -(("{[#A][#B]}.{#A=CC[$]}", "Found node #B but no corresponding fragment."), - ("{[#A][#B]1}.{#A=CC[$],#B=OC[$]}", "You have a dangling ring index."), - ("{[#A]1[#B]1}{#A=CC[$],#B=OC[$]}", "You define two edges between the same node. Use bond order symbols instead."), +@pytest.mark.parametrize('cgsmiles_str, error_message, error_type',( +(("{[#A][#B]}.{#A=CC[$]}", "Found node #B but no corresponding fragment.", SyntaxError), + ("{[#A][#B]1}.{#A=CC[$],#B=OC[$]}", "You have a dangling ring index.", SyntaxError), + ("{[#A]1[#B]1}{#A=CC[$],#B=OC[$]}", "You define two edges between the same node. Use bond order symbols instead.", SyntaxError), + ("{[#A;w=abc][#B]}.{#A=CC[$],#B=OC[$]}", "Argument 'w' must be of type float.", TypeError), + ("{[#A;w=ab=c][#B]}.{#A=CC[$],#B=OC[$]}", "Your annotation w=ab=c contains too many = charachters. Only one chacracter per key value pair is allowed", SyntaxError), + ("{[#A;w=1,c=1,q=a;d][#B]}.{#A=CC[$],#B=OC[$]}", "You have too many positional arguments or ; as part of key value pairs which is not allowed.", SyntaxError), ))) -def test_syntax_errors(cgsmiles_str, error_message): - with pytest.raises(SyntaxError) as e_message: +def test_syntax_errors(cgsmiles_str, error_message, error_type): + with pytest.raises(error_type) as e_message: resolver = MoleculeResolver.from_string(cgsmiles_str) cg_mol, aa_mol = resolver.resolve() assert e_message == error_message diff --git a/cgsmiles/tests/test_sampler.py b/cgsmiles/tests/test_sampler.py index e1b08cd..8ccfd01 100644 --- a/cgsmiles/tests/test_sampler.py +++ b/cgsmiles/tests/test_sampler.py @@ -104,6 +104,7 @@ def test_add_fragment(graph_str, ref_graph = read_cgsmiles(ref_mol) nx.set_node_attributes(ref_graph, bonding, 'bonding') nx.set_node_attributes(ref_graph, fragid, 'fragid') + nx.set_node_attributes(ref_graph, 1, 'w') atomnames = nx.get_node_attributes(ref_graph, 'fragname') nx.set_node_attributes(ref_graph, atomnames, 'atomname') nx.set_node_attributes(ref_graph, 
resnames, 'fragname') diff --git a/cgsmiles/tests/test_utils.py b/cgsmiles/tests/test_utils.py index fa0c730..b485e58 100644 --- a/cgsmiles/tests/test_utils.py +++ b/cgsmiles/tests/test_utils.py @@ -16,6 +16,7 @@ ('{#A=[$]=CCC=[$]}', 4, None, None), ('{#A=[$]cccc}',5, None, None), ('{#A=[$]ccc}', 0, SyntaxError, err_msg_rebuild_h), + ('{#A=[$]C(Cl)(Cl)(Cl)(Cl)}', 0, None, None), )) def test_rebuild_hatoms(frag_str, hatoms_ref, error_type, err_msg): frag_dict = cgsmiles.read_fragments(frag_str) diff --git a/cgsmiles/tests/test_write_cgsmiles.py b/cgsmiles/tests/test_write_cgsmiles.py index 8bfae93..836c3bf 100644 --- a/cgsmiles/tests/test_write_cgsmiles.py +++ b/cgsmiles/tests/test_write_cgsmiles.py @@ -22,8 +22,13 @@ )) def test_write_fragments(input_string): frag_dict = read_fragments(input_string) + for g in frag_dict.values(): + print(g.nodes(data=True)) out_string = write_cgsmiles_fragments(frag_dict, smiles_format=True) frag_dict_out = read_fragments(out_string) + for g in frag_dict_out.values(): + print(g.nodes(data=True)) + print(out_string) assert set(frag_dict_out) == set(frag_dict) for fragname in frag_dict: assertEqualGraphs(frag_dict_out[fragname], frag_dict[fragname]) @@ -58,6 +63,7 @@ def test_write_cgsmiles(input_string): fragment_dicts = resolver.fragment_dicts molecule = resolver.molecule output_string = write_cgsmiles(molecule, fragment_dicts) + print(output_string) out_resolver = MoleculeResolver.from_string(output_string) out_mol = out_resolver.molecule assertEqualGraphs(molecule, out_mol)