diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index b8d5038..1bbd88c 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -37,13 +37,13 @@ def merge_graphs(source_graph, target_graph, max_node=None): # We assume that the last id is always the largest. last_node_idx = max_node offset = last_node_idx - fragment_offset = source_graph.nodes[last_node_idx].get('fragid', 0) + fragment_offset = max(source_graph.nodes[last_node_idx].get('fragid', [0])) + 1 correspondence = {} for idx, node in enumerate(target_graph.nodes(), start=offset + 1): correspondence[node] = idx new_atom = copy.copy(target_graph.nodes[node]) - new_atom['fragid'] = (new_atom.get('fragid', 0) + fragment_offset) + new_atom['fragid'] = [(new_atom.get('fragid', 0) + fragment_offset)] source_graph.add_node(idx, **new_atom) for node1, node2 in target_graph.edges: @@ -53,31 +53,27 @@ def merge_graphs(source_graph, target_graph, max_node=None): return correspondence -def sort_nodes(graph, sortby_attrs=("fragid", "atomid"), target_attr=None): +def sort_nodes_by_attr(graph, sort_attr="fragid"): """ - Sort nodes in graph by multiple attributes. + Sort nodes in graph by attribute and relable the graph in place. Parameters ---------- graph: :class:`nx.Graph` the graph to sort nodes of - sortby_attrs: tuple(str) - the attributes and in which order to sort - target_attr: `abc.hashable` - if not None indices are assigned to this - attribute starting at 1 + sort_attr: `abc.hashable` + the attribute to use for sorting Returns ------- nx.Graph graph with nodes sorted in correct order """ - node_order = sorted(graph, key=partial(_keyfunc, graph, attrs=sortby_attrs)) - for new_idx, node_key in enumerate(node_order, 1): -# graph._node.move_to_end(node_key) - if target_attr is not None: - graph.nodes[node_key][target_attr] = new_idx - return graph + attr_values = nx.get_node_attributes(graph, sort_attr) + sorted_ids = sorted(attr_values, key=lambda item: (attr_values[item], item)) + mapping = {old: new for new, old in enumerate(sorted_ids)} + new_graph = nx.relabel_nodes(graph, mapping, copy=True) + return new_graph def _keyfunc(graph, node_idx, attrs): """ @@ -95,23 +91,26 @@ def annotate_fragments(meta_graph, molecule): """ node_to_fragids = nx.get_node_attributes(molecule, 'fragid') - fragids = defaultdict(list) - for fragid, node in node_to_fragids.items(): - fragids[fragid].append(node) + fragid_to_node = defaultdict(list) + for node, fragids in node_to_fragids.items(): + for fragid in fragids: + fragid_to_node[fragid].append(node) for meta_node in meta_graph.nodes: # adding node to the fragment graph graph_frag = nx.Graph() - for node in fragids[meta_node]: + for node in fragid_to_node[meta_node]: attrs = molecule.nodes[node] graph_frag.add_node(node, **attrs) # adding the edges # this is slow but OK; we always assume that the fragment # is much much smaller than the fullblown graph - combinations = itertools.combinations(fragids[meta_node], r=2) + combinations = itertools.combinations(fragid_to_node[meta_node], r=2) for a, b in combinations: if molecule.has_edge(a, b): - graph.add_edge(a, b) + graph_frag.add_edge(a, b) + + meta_graph.nodes[meta_node]['graph'] = graph_frag return meta_graph diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py new file mode 100644 index 0000000..5c9f4e5 --- /dev/null +++ b/cgsmiles/pysmiles_utils.py @@ -0,0 +1,58 @@ +import pysmiles + +VALENCES = pysmiles.smiles_helper.VALENCES +VALENCES.update({"H": (1,)}) + +def rebuild_h_atoms(mol_graph, keep_bonding=False): + """ + Helper function which add hydrogen atoms to the molecule graph. + + First the hcount attribute produced by pysmiles is updated, because + fragments have no bonds at time of reading so pysmiles does not + know the connectivity. Hence the hcount is redone based on the + actual connectivity of the final molecule. + + This function also makes sure to tag single hydrogen fragments, + in order to not merge them with adjecent fragments. Otherwise, + explicit single hydrogen residues would not be possible. + + The molecule graph is updated in place with the hydrogen atoms + that are missing. + + Using the keep_bonding argument the hydrogen count is reduced + by the number of bonding descriptors. In this way hydrogen + atoms can also be added to fragments only. + + Parameters + ---------- + mol_graph: :class:`nx.Graph` + graph describing the full molecule without hydrogen atoms + """ + for node in mol_graph.nodes: + if mol_graph.nodes[node].get('bonding', False): + # get the degree + ele = mol_graph.nodes[node]['element'] + # hcount is the valance minus the degree minus + # the number of bonding descriptors + bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\ + mol_graph.neighbors(node)])) + charge = mol_graph.nodes[node].get('charge', 0) + hcount = pysmiles.smiles_helper._valence(mol_graph, node, minimum=0) -\ + bonds +\ + charge + # in this case we only rebuild hydrogen atoms that are not + # replaced by bonding operators. + if keep_bonding: + hcount -= len(mol_graph.nodes[node]['bonding']) + + mol_graph.nodes[node]['hcount'] = hcount + if ele == "H": + mol_graph.nodes[node]['single_h_frag'] = True + + pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) + for node in mol_graph.nodes: + if mol_graph.nodes[node].get("element", "*") == "H" and\ + not mol_graph.nodes[node].get("single_h_frag", False): + ref_node = next(mol_graph.neighbors(node)) + mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"] + mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"] diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index d8740b0..101ba3b 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -6,6 +6,36 @@ import pysmiles from .read_cgsmiles import read_cgsmiles +class PeekIter(object): + """ + Custom iter that allows looking ahead, without + advancing the actual iter. + """ + def __init__(self, collection): + self.collection = iter(collection) + self._peek = None + + def __next__(self): + if self._peek: + item = self._peek + self._peek = None + else: + item = next(self.collection) + return item + + def peek(self): + if self._peek: + return self._peek + try: + self._peek = next(self) + except StopIteration: + self._peek = None + return self._peek + + def __iter__(self): + return self + + def strip_bonding_descriptors(fragment_string): """ Processes a CGBigSmile fragment string by @@ -27,21 +57,30 @@ def strip_bonding_descriptors(fragment_string): a dict mapping bonding descriptors to the nodes within the string """ - smile_iter = iter(fragment_string) + bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0} + smile_iter = PeekIter(fragment_string) bonding_descrpt = defaultdict(list) smile = "" node_count = 0 prev_node = 0 + current_order = None for token in smile_iter: if token == '[': peek = next(smile_iter) - if peek in ['$', '>', '<']: + if peek in ['$', '>', '<', '!']: bond_descrp = peek peek = next(smile_iter) while peek != ']': bond_descrp += peek peek = next(smile_iter) - bonding_descrpt[prev_node].append(bond_descrp) + if smile_iter.peek() in bond_to_order: + order = bond_to_order[next(smile_iter)] + elif current_order: + order = current_order + current_order = None + else: + order = 1 + bonding_descrpt[prev_node].append(bond_descrp + str(order)) else: atom = token while peek != ']': @@ -59,47 +98,16 @@ def strip_bonding_descriptors(fragment_string): elif token == ')': prev_node = anchor smile += token + elif token in bond_to_order: + current_order = bond_to_order[token] else: - if token not in '] H @ . - = # $ : / \\ + - %'\ + if token not in '] H @ $ / \\ + - %'\ and not token.isdigit(): prev_node = node_count node_count += 1 smile += token return smile, bonding_descrpt -def _rebuild_h_atoms(mol_graph): - # special hack around to fix - # pysmiles bug for a single - # atom molecule; we assume that the - # hcount is just wrong and set it to - # the valance number minus bonds minus - # bonding connectors - if len(mol_graph.nodes) == 1: - ele = mol_graph.nodes[0]['element'] - # for N and P we assume the regular valency - hcount = pysmiles.smiles_helper.VALENCES[ele][0] - if mol_graph.nodes[0].get('bonding', False): - hcount -= 1 - mol_graph.nodes[0]['hcount'] = hcount - else: - for node in mol_graph.nodes: - if mol_graph.nodes[node].get('bonding', False): - # get the degree - ele = mol_graph.nodes[node]['element'] - # hcount is the valance minus the degree minus - # the number of bonding descriptors - bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\ - mol_graph.neighbors(node)])) - charge = mol_graph.nodes[node].get('charge', 0) - hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\ - bonds -\ - len(mol_graph.nodes[node]['bonding']) +\ - charge - mol_graph.nodes[node]['hcount'] = hcount - - pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) - return mol_graph - def fragment_iter(fragment_str, all_atom=True): """ Iterates over fragments defined in a CGBigSmile string. @@ -137,8 +145,6 @@ def fragment_iter(fragment_str, all_atom=True): elif all_atom: mol_graph = pysmiles.read_smiles(smile) nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') - # we need to rebuild hydrogen atoms now - _rebuild_h_atoms(mol_graph) # we deal with a CG resolution graph else: mol_graph = read_cgsmiles(smile) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index fedc3c0..a0a373d 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -1,12 +1,11 @@ import re +import copy import networkx as nx import pysmiles from .read_cgsmiles import read_cgsmiles from .read_fragments import read_fragments -from .graph_utils import merge_graphs, sort_nodes, annotate_fragments - -VALENCES = pysmiles.smiles_helper.VALENCES -VALENCES.update({"H": (1,)}) +from .graph_utils import merge_graphs, sort_nodes_by_attr, annotate_fragments +from .pysmiles_utils import rebuild_h_atoms def compatible(left, right): """ @@ -22,7 +21,7 @@ def compatible(left, right): ------- bool """ - if left == right and left not in '> <': + if left == right and left[0] not in '> <': return True l, r = left[0], right[0] if (l, r) == ('<', '>') or (l, r) == ('>', '<'): @@ -129,6 +128,10 @@ def __init__(self, def resolve_disconnected_molecule(self): """ + Given a connected graph of nodes with associated fragment graphs + generate a disconnected graph of the fragments and annotate + each fragment graph to the node in the higher resolution + graph. """ for meta_node in self.meta_graph.nodes: fragname = self.meta_graph.nodes[meta_node]['fragname'] @@ -139,19 +142,20 @@ def resolve_disconnected_molecule(self): for node in fragment.nodes: new_node = correspondence[node] - attrs = self.molecule.nodes[new_node] + attrs = copy.deepcopy(self.molecule.nodes[new_node]) graph_frag.add_node(correspondence[node], **attrs) - nx.set_node_attributes(graph_frag, meta_node, 'fragid') + nx.set_node_attributes(graph_frag, [meta_node], 'fragid') for a, b in fragment.edges: new_a = correspondence[a] new_b = correspondence[b] + attrs = copy.deepcopy(fragment.edges[(a, b)]) graph_frag.add_edge(new_a, - new_b) + new_b, + **attrs) self.meta_graph.nodes[meta_node]['graph'] = graph_frag - def edges_from_bonding_descrpt(self): """ Make edges according to the bonding descriptors stored @@ -167,54 +171,39 @@ def edges_from_bonding_descrpt(self): edge, bonding = generate_edge(prev_graph, node_graph) - #To DO - clean copying of bond-list attribute - # this is a bit of a workaround because at this stage the - # bonding list is actually shared between all residues of - # of the same type; so we first make a copy then we replace - # the list sans used bonding descriptor - prev_bond_list = prev_graph.nodes[edge[0]]['bonding'].copy() - prev_bond_list.remove(bonding[0]) - prev_graph.nodes[edge[0]]['bonding'] = prev_bond_list - node_bond_list = node_graph.nodes[edge[1]]['bonding'].copy() - node_bond_list.remove(bonding[1]) - node_graph.nodes[edge[1]]['bonding'] = node_bond_list - order = re.findall("\d+\.\d+", bonding[0]) + # remove used bonding descriptors + prev_graph.nodes[edge[0]]['bonding'].remove(bonding[0]) + node_graph.nodes[edge[1]]['bonding'].remove(bonding[1]) + # bonding descriptors are assumed to have bonding order 1 # unless they are specifically annotated - if not order: - order = 1 + order = int(bonding[0][-1]) self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order) - def replace_unconsumed_bonding_descrpt(self): + def squash_atoms(self): """ - We allow multiple bonding descriptors per atom, which - however, are not always consumed. In this case the left - over bonding descriptors are replaced by hydrogen atoms. + Applies the squash operator by removing the duplicate node + adding, all edges from that node to the remaining one, and + annotate the other node with the fragid of the removed + node. """ - for meta_node in self.meta_graph.nodes: - graph = self.meta_graph.nodes[meta_node]['graph'] - bonding = nx.get_node_attributes(graph, "bonding") - for node, bondings in bonding.items(): - if bondings: - element = graph.nodes[node]['element'] - bonds = round(sum([self.molecule.edges[(node, neigh)]['order'] for neigh in\ - self.molecule.neighbors(node)])) - hcount = VALENCES[element][0] - bonds - attrs = {attr: graph.nodes[node][attr] for attr in ['fragname', 'fragid']} - attrs['element'] = 'H' - for _ in range(0, hcount): - new_node = len(self.molecule.nodes) + 1 - graph.add_edge(node, new_node) - attrs['atomname'] = "H" + str(len(graph.nodes)-1) - graph.nodes[new_node].update(attrs) - self.molecule.add_edge(node, new_node, order=1) - self.molecule.nodes[new_node].update(attrs) - # now we want to sort the atoms - sort_nodes(self.molecule) - # and redo the meta molecule - self.meta_graph = annotate_fragments(self.meta_graph, self.molecule) + bondings = nx.get_edge_attributes(self.molecule, 'bonding') + squashed = False + for edge, bonding in bondings.items(): + if not bonding[0].startswith('!'): + continue + # let's squash two nodes + node_to_keep, node_to_remove = edge + self.molecule = nx.contracted_nodes(self.molecule, + node_to_keep, + node_to_remove, + self_loops=False) + + # add the fragment id of the sequashed node + self.molecule.nodes[node_to_keep]['fragid'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid'] def resolve(self): + if self.cgsmiles_string is not None: self.meta_graph = read_cgsmiles(self.cgsmiles_string) @@ -222,9 +211,24 @@ def resolve(self): self.fragment_dict.update(read_fragments(self.fragment_string, all_atom=self.all_atom)) + # add disconnected fragments to graph self.resolve_disconnected_molecule() + + # connect valid bonding descriptors self.edges_from_bonding_descrpt() + + # contract atoms with squash descriptors + self.squash_atoms() + + # rebuild hydrogen in all-atom case if self.all_atom: - self.replace_unconsumed_bonding_descrpt() + rebuild_h_atoms(self.molecule) + + # sort the atoms + self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid")) + + # and redo the meta molecule + self.meta_graph = annotate_fragments(self.meta_graph, + self.molecule) return self.meta_graph, self.molecule diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 8919753..8864216 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -127,31 +127,35 @@ def test_read_cgsmiles(smile, nodes, edges): # smiple symmetric bonding ("[$]COC[$]", "COC", - {0: ["$"], 2: ["$"]}), + {0: ["$1"], 2: ["$1"]}), + # smiple symmetric bonding with more than one name + ("[$1A]COC[$1A]", + "COC", + {0: ["$1A1"], 2: ["$1A1"]}), # simple symmetric but with explicit hydrogen ("[$][CH2]O[CH2][$]", "[CH2]O[CH2]", - {0: ["$"], 2: ["$"]}), + {0: ["$1"], 2: ["$1"]}), # smiple symmetric bonding; multiple descript ("[$]COC[$][$1]", "COC", - {0: ["$"], 2: ["$", "$1"]}), + {0: ["$1"], 2: ["$1", "$11"]}), # named different bonding descriptors ("[$1]CCCC[$2]", "CCCC", - {0: ["$1"], 3: ["$2"]}), + {0: ["$11"], 3: ["$21"]}), # ring and bonding descriptors ("[$1]CC[$2]C1CCCCC1", "CCC1CCCCC1", - {0: ["$1"], 1: ["$2"]}), + {0: ["$11"], 1: ["$21"]}), # bonding descript. after branch ("C(COC[$1])[$2]CCC[$3]", "C(COC)CCC", - {0: ["$2"], 3: ["$1"], 6: ["$3"]}), + {0: ["$21"], 3: ["$11"], 6: ["$31"]}), # left rigth bonding desciptors ("[>]COC[<]", "COC", - {0: [">"], 2: ["<"]}) + {0: [">1"], 2: ["<1"]}) )) def test_strip_bonding_descriptors(big_smile, smile, bonding): new_smile, new_bonding = strip_bonding_descriptors(big_smile) @@ -161,75 +165,53 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding): @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment ("{#PEO=[$]COC[$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (3, {"atomname": "H3", "fragname": "PEO", "element": "H"}), - (4, {"atomname": "H4", "fragname": "PEO", "element": "H"}), - (5, {"atomname": "H5", "fragname": "PEO", "element": "H"}), - (6, {"atomname": "H6", "fragname": "PEO", "element": "H"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), )}, - {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), + {"PEO": [(0, 1), (1, 2)]}), # single fragment but with explicit hydrogen in smiles ("{#PEO=[$][CH2]O[CH2][$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (3, {"atomname": "H3", "fragname": "PEO", "element": "H"}), - (4, {"atomname": "H4", "fragname": "PEO", "element": "H"}), - (5, {"atomname": "H5", "fragname": "PEO", "element": "H"}), - (6, {"atomname": "H6", "fragname": "PEO", "element": "H"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), )}, - {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), + {"PEO": [(0, 1), (1, 2)]}), # test NH3 terminal ("{#AMM=N[$]}", - {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$"], "element": "N"}), - (1, {"atomname": "H1", "fragname": "AMM", "element": "H"}), - (2, {"atomname": "H2", "fragname": "AMM", "element": "H"}), + {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3}), )}, - {"AMM": [(0, 1), (0, 2)]}), + {"AMM": []}), # single fragment + 1 terminal (i.e. only 1 bonding descrpt ("{#PEO=[$]COC[$],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (3, {"atomname": "H3", "fragname": "PEO", "element": "H"}), - (4, {"atomname": "H4", "fragname": "PEO", "element": "H"}), - (5, {"atomname": "H5", "fragname": "PEO", "element": "H"}), - (6, {"atomname": "H6", "fragname": "PEO", "element": "H"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}), - (1, {"atomname": "H1", "fragname": "OHter", "element": "H"}))}, - {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)], - "OHter": [(0, 1)]}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}),)}, + {"PEO": [(0, 1), (1, 2)], + "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # this adjust the hydrogen count ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}), - (3, {"atomname": "H3", "fragname": "PEO", "element": "H"}), - (4, {"atomname": "H4", "fragname": "PEO", "element": "H"}), - (5, {"atomname": "H5", "fragname": "PEO", "element": "H"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}), - (1, {"atomname": "H1", "fragname": "OHter", "element": "H"}))}, - {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)], - "OHter": [(0, 1)]}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}),)}, + {"PEO": [(0, 1), (1, 2)], + "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # but explicit hydrogen in the smiles string ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}), - (3, {"atomname": "H3", "fragname": "PEO", "element": "H"}), - (4, {"atomname": "H4", "fragname": "PEO", "element": "H"}), - (5, {"atomname": "H5", "fragname": "PEO", "element": "H"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}), - (1, {"atomname": "H1", "fragname": "OHter", "element": "H"}))}, - {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)], - "OHter": [(0, 1)]}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}), + )}, + {"PEO": [(0, 1), (1, 2),], + "OHter": []}), )) def test_fragment_iter(fragment_str, nodes, edges): diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index 9a7ff66..2c481c8 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -24,6 +24,11 @@ {0: ['>'], 1: ['$5']}, (3, 0), ('<', '>')), + # left right with annotated > < + ({0: ['$'], 1: ['>1'], 3: ['<1']}, + {0: ['>1'], 1: ['$5']}, + (3, 0), + ('<1', '>1')), # left right selective bonding # with identifier ({0: ['$'], 1: ['>'], 3: ['<1']}, @@ -42,22 +47,34 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): assert new_btypes == btypes -@pytest.mark.parametrize('smile, ref_nodes, ref_edges',( +@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges',( # smiple linear seqeunce ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}", # 0 1 2 3 4 5 6 7 8 [('OHter', 'O H'), ('PEO', 'C O C H H H H'), # 9 10 11 12 13 14 15 16 17 ('PEO', 'C O C H H H H'), ('OHter', 'O H')], + 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)]), + # smiple linear seqeunce with bond-order in link + ("{[#TC1][#TC4][#TC1]}.{#TC1=[$1]=CC=[$2],#TC4=[$1]=CC=[$2]}", + # 0 1 2 3 4 5 6 7 8 9 + [('TC1', 'C C H H H H'), ('TC4', 'C C H H'), + # 10 11 12 13 14 15 + ('TC1', 'C C H H H H')], + 'C C H H H H C C H H C C H H H H', + [(0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (0, 6), (6, 7), + (6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13), + (10, 14), (11, 15)]), # smiple linear seqeunce unconsumed bonding descrpt ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}", # 0 1 2 3 4 5 6 7 8 [('OHter', 'O H'), ('PEO', 'C O C H H H H'), # 9 10 11 12 13 14 15 16 17 ('PEO', 'C O C H H H H'), ('OHter', 'O H')], + 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)]), @@ -67,23 +84,37 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): [('OHter', 'O Na'), ('PEO', 'C O C H H H H'), # 9 10 11 12 13 14 15 16 17 ('PEO', 'C O C H H H H'), ('OHter', 'O Na')], + 'O Na C O C H H H H C O C H H H H O Na', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)]), - + # smiple linear seqeunce with ionic ending + ("{[#OH][#PEO]|2[#ON]}.{#PEO=[$]COC[$],#OH=[$]O,#ON=[$][O-]}", + # 0 1 2 3 4 5 6 7 8 + [('OH', 'O H'), ('PEO', 'C O C H H H H'), + # 9 10 11 12 13 14 15 16 17 + ('PEO', 'C O C H H H H'), ('ON', 'O')], + 'O H C O C H H H H C O C H H H H O', + [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), + (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), + (11, 14), (11, 15), (11, 16)]), # uncomsumed bonding IDs; note that this is not the same # molecule as previous test case. Here one of the OH branches # and replaces an CH2 group with CH-OH - ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[>][$1]COC[<],#OHter=[$1][O]}", + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[>][$1A]COC[<],#OHter=[$1A][O]}", + # 0 1 2 3 4 5 6 7 8 [('OHter', 'O H'), ('PEO', 'C O C H H H H'), + # 9 10 11 12 13 14 15 16 17 ('PEO', 'C O C H H H H'), ('OHter', 'O H')], - [(0, 1), (0, 2), (2, 3), (2, 5), (2, 10), (3, 4), - (4, 6), (4, 7), (4, 17), (8, 9), (8, 11), (8, 14), - (8, 18), (9, 10), (10, 12), (10, 13), (14, 15)]), + 'O H C O C H H H H C O C H H H H O H', + [(0, 1), (0, 2), (2, 3), (2, 5), (2, 11), (3, 4), + (4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13), + (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)]), # simple branched sequence ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), ('Hter', 'H')], + 'H C C H H H C O C H H H H H C C H H H C O C H H H H H H', [(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8), (8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27), (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)]), @@ -93,6 +124,7 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$]H}", [('Hter', 'H'), ('PS', 'C C C C C C C C H H H H H H H H'), ('PS', 'C C C C C C C C H H H H H H H H'), ('Hter', 'H')], + 'H C C C C C C C C H H H H H H H H C C C C C C C C H H H H H H H H H', [(0, 1), (1, 2), (1, 9), (1, 10), (2, 3), (2, 11), (2, 17), (3, 4), (3, 8), (4, 5), (4, 12), (5, 6), (5, 13), (6, 7), (6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25), @@ -107,17 +139,68 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): # ('PE', 'C C H H H H H'), ('PE', 'C C H H H H'), ('PE', 'C C H H H'), # ('PE', 'C C H H H H'), ('PE', 'C C H H H H H'), ('PE', 'C C H H H H H')] # [, -# ( +# ( + # smiple squash operator; no unconsumed operators + ("{[#A][#B]}.{#A=OC[!],#B=[!]CC}", + # 0 1 2 3 4 1 5 3 4 6 7 8 + # note that the order here is O H then C H H + # because the C H H is shared between fragments + # so that sorting by fragment id yields this order + # the same goes for the second fragment but here + # the CH2 goes in the front because it also belongs + # to the fragment with lower fragid + [('A', 'O H C H H'), ('B', 'C H H C H H H'),], + 'O H C H H C H H H', + [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), + (5, 6), (5, 7), (5, 8)]), + # smiple squash operator; unconsumed operators + ("{[#A][#B]}.{#A=OC[!],#B=[$][!]CC}", + # 0 1 2 3 4 1 5 3 4 6 7 8 + # 0 1 2 3 4 5 6 7 11 8 9 10 + # note that the unconsumed $ triggers rebuild of a hydrogen + # which however is appended to the end of the molecule so + # making it 11 + [('A', 'O H C H H'), ('B', 'C H H C H H H'),], + 'O H C H H C H H H', + [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), + (5, 6), (5, 7), (5, 8)]), + # smiple squash operator; plus connect operator + ("{[#A][#B][#C]}.{#A=OC[!],#B=[$][!]CC,#C=[$]O}", + # 0 1 2 3 4 1 5 3 4 6 7 8 + # 0 1 2 3 4 5 6 7 11 8 9 10 + # note that the unconsumed $ triggers rebuild of a hydrogen + # which however is appended to the end of the molecule so + # making it 11 + [('A', 'O H C H'), ('B', 'C H C H H H'), ('C', 'O H')], + 'O H C H C H H H O H', + [(0, 1), (0, 2), (2, 3), (2, 4), + (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)]), )) -def test_def_big_smile_parser(smile, ref_nodes, ref_edges): +def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges): meta_mol, molecule = MoleculeResolver(smile).resolve() -# nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) -# plt.show() - for node, ref in zip(meta_mol.nodes, ref_nodes): + + # loop and compare fragments first + for node, ref in zip(meta_mol.nodes, ref_frags): assert meta_mol.nodes[node]['fragname'] == ref[0] block_graph = meta_mol.nodes[node]['graph'] - elements = nx.get_node_attributes(block_graph, 'element') #.values()) - sorted_elements = [elements[key] for key in sorted(elements)] + target_elements = nx.get_node_attributes(block_graph, 'element') + sorted_elements = [target_elements[key] for key in sorted(target_elements)] assert sorted_elements == ref[1].split() - assert sorted(molecule.edges) == sorted(ref_edges) + + # make the full scale reference graph + ref_graph = nx.Graph() + ref_graph.add_edges_from(ref_edges) + nx.set_node_attributes(ref_graph, + dict(zip(sorted(ref_graph.nodes), elements.split())), + 'element') + + def _ele_match(n1, n2): + return n1["element"] == n2["element"] + + print(smile) + print(ref_graph.edges) + print(molecule.edges) + assert ref_graph.edges == molecule.edges + # check that reference graph and molecule are isomorphic + assert nx.is_isomorphic(ref_graph, molecule, node_match=_ele_match)