diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index 500946f..64c384f 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -164,8 +164,11 @@ def set_atom_names_atomistic(molecule, meta_graph=None): fraglist = defaultdict(list) if meta_graph: for meta_node in meta_graph.nodes: - fraggraph = meta_graph.nodes[meta_node]['graph'] - fraglist[meta_node] += list(fraggraph.nodes) + # to catch virtual side nodes that do not have a representation + # in the atomsitic structure + fraggraph = meta_graph.nodes[meta_node].get('graph', None) + if fraggraph: + fraglist[meta_node] += list(fraggraph.nodes) else: node_to_fragid = nx.get_node_attributes(molecule, 'fragid') for node, fragids in node_to_fragid.items(): diff --git a/cgsmiles/read_cgsmiles.py b/cgsmiles/read_cgsmiles.py index f56180c..cd2c0a7 100644 --- a/cgsmiles/read_cgsmiles.py +++ b/cgsmiles/read_cgsmiles.py @@ -31,21 +31,22 @@ def _expand_branch(mol_graph, current, anchor, recipe): anchor: abc.hashable anchor to which to connect current node - recipie: list[(str, int)] + recipe: list[(str, int, int)] list storing tuples of node names and the number of times the node has to be added + and their bond order Returns ------- nx.Graph """ prev_node = anchor - for bdx, (fragname, n_mon) in enumerate(recipe): + for bdx, (fragname, n_mon, order) in enumerate(recipe): if bdx == 0: anchor = current for _ in range(0, n_mon): mol_graph.add_node(current, fragname=fragname) - mol_graph.add_edge(prev_node, current, order=1) + mol_graph.add_edge(prev_node, current, order=order) prev_node = current current += 1 @@ -53,10 +54,6 @@ def _expand_branch(mol_graph, current, anchor, recipe): prev_node = anchor return mol_graph, current, prev_node -def _get_percent(pattern, stop): - end_num = _find_next_character(pattern, ['[', ')', '(', '}'], stop) - return pattern[stop+1:end_num] - def read_cgsmiles(pattern): """ Generate a :class:`nx.Graph` from a pattern string according to the @@ -133,6 +130,10 @@ def read_cgsmiles(pattern): cycle_edges = [] # each element in the for loop matches a pattern # '[' + '#' + some alphanumeric name + ']' + symbol_to_order = {".": 0, "=": 2, "-": 1, "#": 3, "$": 4} + default_bond_order = 1 + bond_order = None + prev_bond_order = None for match in re.finditer(PATTERNS['place_holder'], pattern): start, stop = match.span() # we start a new branch when the residue is preceded by '(' @@ -142,32 +143,61 @@ def read_cgsmiles(pattern): branch_anchor.append(prev_node) # the recipe for making the branch includes the anchor; # which is hence the first residue in the list - recipes[branch_anchor[-1]] = [(mol_graph.nodes[prev_node]['fragname'], 1)] + # at this point the bond order is still 1 unless we have an expansion + recipes[branch_anchor[-1]] = [(mol_graph.nodes[prev_node]['fragname'], 1, 1)] # here we check if the atom is followed by a cycle marker # in this case we have an open cycle and close it - for token in pattern[stop:]: - # we close a cycle - if token.isdigit() and token in cycle: - cycle_edges.append((current, cycle[token])) - del cycle[token] - # we open a cycle - elif token.isdigit(): - cycle[token] = current - # we found a ring indicator - elif token == "%": - ring_marker = _get_percent(pattern, stop) - # we close the ring + ring_marker = "" + multi_ring = False + ring_bond_order = default_bond_order + for rdx, token in enumerate(pattern[stop:]): + if multi_ring and not token.isdigit(): + ring_marker = int(ring_marker[1:]) if ring_marker in cycle: - cycle_edges.append((current, cycle[ring_marker])) + cycle_edges.append((current, + cycle[ring_marker][0], + cycle[ring_marker][1])) del cycle[ring_marker] - break - # we open a new ring - cycle[_get_percent(pattern, stop)] = current - break + else: + cycle[ring_marker] = [current, ring_bond_order] + multi_ring = False + ring_marker = "" + ring_bond_order = default_bond_order + + # we open a new multi ring + if token == "%": + multi_ring = True + ring_marker = '%' + # we open a ring or close + elif token.isdigit(): + ring_marker += token + if not multi_ring: + ring_marker = int(ring_marker) + # we have a single digit marker and it is in + # cycle so we close it + if ring_marker in cycle: + cycle_edges.append((current, + cycle[ring_marker][0], + cycle[ring_marker][1])) + del cycle[ring_marker] + # the marker is not in cycle so we update cycles + else: + cycle[ring_marker] = [current, ring_bond_order] + ring_marker = "" + ring_bond_order = default_bond_order + # we found bond_order + elif token in symbol_to_order: + ring_bond_order = symbol_to_order[token] else: break + # check if there is a bond-order following the node + if stop < len(pattern) and pattern[stop+rdx-1] in '- + . = # $': + bond_order = symbol_to_order[pattern[stop+rdx-1]] + else: + bond_order = default_bond_order + # here we check if the atom is followed by a expansion character '|' # as in ... [#PEO]| if stop < len(pattern) and pattern[stop] == '|': @@ -189,7 +219,7 @@ def read_cgsmiles(pattern): # the recipe dict together with the anchor residue # and expansion number if branching: - recipes[branch_anchor[-1]].append((fragname, n_mon)) + recipes[branch_anchor[-1]].append((fragname, n_mon, prev_bond_order)) # new we add new residue as often as required connection = [] @@ -197,16 +227,19 @@ def read_cgsmiles(pattern): mol_graph.add_node(current, fragname=fragname) if prev_node is not None: - mol_graph.add_edge(prev_node, current, order=1) + mol_graph.add_edge(prev_node, current, order=prev_bond_order) + + prev_bond_order = bond_order # here we have a double edge for cycle_edge in cycle_edges: if cycle_edge in mol_graph.edges: - mol_graph.edges[cycle_edge]["order"] += 1 - else: - mol_graph.add_edge(cycle_edge[0], - cycle_edge[1], - order=1) + msg=("You define two edges between the same node." + "Use bond order symbols instead.") + raise SyntaxError(msg) + mol_graph.add_edge(cycle_edge[0], + cycle_edge[1], + order=cycle_edge[2]) prev_node = current current += 1 @@ -236,12 +269,19 @@ def read_cgsmiles(pattern): eon_a = _find_next_character(pattern, [')'], stop) # Then we check if the expansion character # is next. - if eon_a+1 < len(pattern) and pattern[eon_a+1] == "|": + if (eon_a+1 < len(pattern) and pattern[eon_a+1] == "|") or\ + (eon_a+2 < len(pattern) and pattern[eon_a+2] == "|"): + if pattern[eon_a+2] == "|": + anchor_order = symbol_to_order[pattern[eon_a+1]] + recipe = recipes[prev_node][0] + recipes[prev_node][0] = (recipe[0], recipe[1], anchor_order) + eon_a += 1 # If there is one we find the beginning # of the next branch, residue or end of the string # As before all characters inbetween are a number that # is how often the branch is expanded. - eon_b = _find_next_character(pattern, ['[', ')', '(', '}'], eon_a+1) + next_characters = ['[', ')', '(', '}'] + list(symbol_to_order.keys()) + eon_b = _find_next_character(pattern, next_characters, eon_a+1) # the outermost loop goes over how often a the branch has to be # added to the existing sequence for idx in range(0,int(pattern[eon_a+2:eon_b])-1): @@ -276,6 +316,13 @@ def read_cgsmiles(pattern): prev_anchor = ref_anchor # all branches added; then go back to the base anchor prev_node = base_anchor + #================================================ + # bond orders for after branches # + #================================================ + if pattern[eon_b] in symbol_to_order: + prev_bond_order = symbol_to_order[pattern[eon_b]] + elif eon_a+1 < len(pattern) and pattern[eon_a+1] in symbol_to_order: + prev_bond_order = symbol_to_order[pattern[eon_a+1]] # if all branches are done we need to reset the lists # when all nested branches are completed if len(branch_anchor) == 0: diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index a61df3e..fc50cda 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -1,5 +1,6 @@ import re import copy +import numpy as np import networkx as nx from .read_cgsmiles import read_cgsmiles from .read_fragments import read_fragments @@ -239,6 +240,15 @@ def resolve_disconnected_molecule(self, fragment_dict): """ for meta_node in self.meta_graph.nodes: fragname = self.meta_graph.nodes[meta_node]['fragname'] + + # we have a virtual node and skip it + if fragname not in fragment_dict: + neighbors = self.meta_graph.neighbors(meta_node) + orders = [self.meta_graph.edges[(meta_node, neigh)]["order"] for neigh in neighbors] + if not all(np.array(orders) == 0): + raise SyntaxError(f"Found node #{fragname} but no corresponding fragment.") + continue + fragment = fragment_dict[fragname] correspondence = merge_graphs(self.molecule, fragment) diff --git a/cgsmiles/sample.py b/cgsmiles/sample.py index 7dd1927..2b06de5 100644 --- a/cgsmiles/sample.py +++ b/cgsmiles/sample.py @@ -59,7 +59,7 @@ class MoleculeSampler: descriptor is used to highlight the fact that the probabilistic behaviour is driven by the bonding descriptors. - The sampler features a two level connectivity determination. First using the + The sampler features a two level connectivity determination. First using the `polymer_reactivities` an open bonding descriptor from the growing polymer molecule is selected according to the probabilities provided. Subsequently, a fragment is randomly chosen that has a matching complementary bonding diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index ccedcc0..bd47839 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -10,7 +10,7 @@ [(0, 1), (1, 2)], [1, 1]), # smiple linear sequenece with multi-edge - ("{[#PMA]1[#PEO]1}", + ("{[#PMA]=[#PEO]}", ["PMA", "PEO"], [(0, 1)], [2]), @@ -34,6 +34,41 @@ ["PMA", "PEO", "PMA"], [(0, 1), (1, 2), (0, 2)], [1, 1, 1]), + # smiple cycle sequence bond order to next + ("{[#PMA]1=[#PEO][#PMA]1}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [2, 1, 1]), + # smiple cycle sequence bond order in cycle + ("{[#PMA]=1[#PEO][#PMA]1}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [1, 1, 2]), + # smiple cycle sequence two bond orders + ("{[#PMA].1=[#PEO][#PMA]1}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [2, 1, 0]), + # smiple cycle sequence with % bond order + ("{[#PMA]=%123[#PEO][#PMA]%123}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [1, 1, 2]), + # smiple cycle mixed % and digit marker + ("{[#PMA]=1[#PEO][#PMA]%01}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [1, 1, 2]), + # smiple cycle sequence with % bond order next + ("{[#PMA]%123=[#PEO][#PMA]%123}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [2, 1, 1]), + # smiple cycle sequence with % two bond orders + ("{[#PMA]=%123.[#PEO][#PMA]%123}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [0, 1, 2]), # smiple cycle sequence with % ("{[#PMA]%123[#PEO][#PMA]%123}", ["PMA", "PEO", "PMA"], @@ -57,7 +92,7 @@ # [1, 1, 1, 1, 1, 1, 1, 1]), # smiple linear sequenece with multi-edge # in cycle - ("{[#PMA]12[#PMA][#PMA][#PEO]12}", + ("{[#PMA]=1[#PMA][#PMA][#PEO]1}", ["PMA", "PMA", "PMA", "PEO"], [(0, 1), (1, 2), (2, 3), (0, 3)], [1, 1, 1, 2]), @@ -70,6 +105,42 @@ (0, 4), (4, 5), (5, 6), (6, 7), (4, 8), (8, 9), (9, 10), (10, 11)], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + # simple branch expension with bond orders + ("{[#PMA]([#PEO][#PEO]=[#OHter])|3}", + ["PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter"], + [(0, 1), (1, 2), (2, 3), + (0, 4), (4, 5), (5, 6), (6, 7), + (4, 8), (8, 9), (9, 10), (10, 11)], + [1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2]), + # simple branch expension with bond orders + ("{[#PMA].([#PEO][#PEO]=[#OHter])|3}", + ["PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter"], + [(0, 1), (1, 2), (2, 3), + (0, 4), (4, 5), (5, 6), (6, 7), + (4, 8), (8, 9), (9, 10), (10, 11)], + [0, 1, 2, 1, 0, 1, 2, 1, 0, 1, 2]), + # simple branch expension with bond orders + ("{[#PMA]([#PEO][#PEO]=[#OHter])|3.[#E]}", + ["PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter", "E"], + [(0, 1), (1, 2), (2, 3), + (0, 4), (4, 5), (5, 6), (6, 7), + (4, 8), (8, 9), (9, 10), (10, 11), (8, 12)], + [1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0]), + # not so simple branch expension with bond orders + ("{[#PMA]([#PEO][#PEO]=[#OHter])$|3.[#E]}", + ["PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter", "E"], + [(0, 1), (1, 2), (2, 3), + (0, 4), (4, 5), (5, 6), (6, 7), + (4, 8), (8, 9), (9, 10), (10, 11), (8, 12)], + [1, 1, 2, 4, 1, 1, 2, 4, 1, 1, 2, 0]), # nested branched with expansion ("{[#PMA]([#PEO]|3)|2}", ["PMA", "PEO", "PEO", "PEO", diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index b907757..23818b0 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -170,7 +170,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (2, 4), (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}), # THF like test case with double edge and squash operator - ("{[#A]1[#B]1}.{#A=[!]COC[!],#B=[!]CCCC[!]}", + ("{[#A]=[#B]}.{#A=[!]COC[!],#B=[!]CCCC[!]}", [('A', 'O C C H H H H'), ('B', 'C C H H H H C C H H H H')], 'O C C H H H H C C H H H H', @@ -185,6 +185,16 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (0, 3), (0, 4), (1, 5), (1, 7), (5, 9), (5, 6), (7, 13), (7, 8), (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}), + # simple chirality assigment with rings + ("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}", + # 0 1 2 3 + [('GLC', 'C C C C C C O O O O O O H H H H H H H H H H H H')], + 'C C C C C C O O O O O O H H H H H H H H H H H H', + [(0, 1), (0, 11), (0, 12), (0, 13), (1, 2), (1, 6), (1, 14), (2, 3), (2, 10), + (2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18), + (7, 19), (8, 20), (9, 21), (10, 22), (11, 23)], + {1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4), + 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}), # simple chirality assigment between fragments ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@H][>]C(=O)OC}", # 0 1 2 3 @@ -195,16 +205,6 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), (10, 13), (5, 14), (14, 15)], {3: (2, 10, 4, 14)}, {}), - # simple chirality assigment with rings - ("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}", - # 0 1 2 3 - [('GLC', 'C C C C C C O O O O O O H H H H H H H H H H H H')], - 'C C C C C C O O O O O O H H H H H H H H H H H H', - [(0, 1), (0, 11), (0, 12), (0, 13), (1, 2), (1, 6), (1, 14), (2, 3), (2, 10), - (2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18), - (7, 19), (8, 20), (9, 21), (10, 22), (11, 23)], - {1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4), - 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}), # simple chirality assigment between fragments inv ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@@H][>]C(=O)OC}", # 0 1 2 3 @@ -229,6 +229,16 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (1, 2), (0, 3), (0, 4), (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}), + # test skip virtual nodes + ("{[#SP4]1.2[#SP4].3[#SP1r]1.[#TC4]23}.{#SP4=OC[$]C[$]O,#SP1r=[$]OC[$]CO}", + [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), + ('SP1r', 'O C C O H H H H')], + 'O C C O H H H H O C C O H H H H O C C O H H H H', + [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6), + (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), + (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), + (18, 21), (18, 22), (19, 23)], + {},{}), )) def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez): meta_mol, molecule = MoleculeResolver.from_string(smile).resolve() @@ -240,6 +250,7 @@ def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral block_graph = meta_mol.nodes[node]['graph'] target_elements = nx.get_node_attributes(block_graph, 'element') sorted_elements = [target_elements[key] for key in sorted(target_elements)] + print(target_elements) print("-->", sorted_elements) print("-->", ref[1].split()) assert sorted_elements == ref[1].split() @@ -295,3 +306,13 @@ def _atomname_match(n1, n2): return n1["fragname"] == n2["atomname"] assert nx.is_isomorphic(ref_graph, molecule, node_match=_atomname_match) +@pytest.mark.parametrize('cgsmiles_str, error_message',( +(("{[#A][#B]}.{#A=CC[$]}", "Found node #B but no corresponding fragment."), + ("{[#A][#B]1}.{#A=CC[$],#B=OC[$]}", "You have a dangling ring index."), + ("{[#A]1[#B]1}{#A=CC[$],#B=OC[$]}", "You define two edges between the same node. Use bond order symbols instead."), +))) +def test_syntax_errors(cgsmiles_str, error_message): + with pytest.raises(SyntaxError) as e_message: + resolver = MoleculeResolver.from_string(cgsmiles_str) + cg_mol, aa_mol = resolver.resolve() + assert e_message == error_message diff --git a/cgsmiles/tests/test_write_cgsmiles.py b/cgsmiles/tests/test_write_cgsmiles.py index 47f9a37..8bfae93 100644 --- a/cgsmiles/tests/test_write_cgsmiles.py +++ b/cgsmiles/tests/test_write_cgsmiles.py @@ -38,9 +38,9 @@ def test_write_fragments(input_string): # branched nested "{[#PE][#PMA]([#PEO][#PEO]([#OMA][#OMA]1[#OMA][#OMA]1))[#PE]}", # special cycle - "{[#PE]1[#PMA]1}", + "{[#PE]=[#PMA]}", # special triple cycle - "{[#A]12[#B]12}", + "{[#A]#[#B]}", )) def test_write_mol_graphs(input_string): mol_graph = read_cgsmiles(input_string) diff --git a/cgsmiles/write_cgsmiles.py b/cgsmiles/write_cgsmiles.py index b19f60a..34d17ce 100644 --- a/cgsmiles/write_cgsmiles.py +++ b/cgsmiles/write_cgsmiles.py @@ -86,16 +86,6 @@ def write_graph(molecule, smiles_format=False, default_element='*'): edges.add(frozenset((n_idx, n_jdx))) total_edges = set(map(frozenset, molecule.edges)) ring_edges = list(total_edges - edges) - # in cgsmiles graphs only bonds of order 1 and 2 - # exists; order 2 means we have a ring at the - # higher resolution. These orders are therefore - # represented as rings and that requires to - # add them to the ring list - if not smiles_format: - for edge in molecule.edges: - if molecule.edges[edge]['order'] != 1: - for n in range(1, molecule.edges[edge]['order']): - ring_edges.append(frozenset(edge)) atom_to_ring_idx = defaultdict(list) ring_idx_to_bond = {} @@ -123,7 +113,7 @@ def write_graph(molecule, smiles_format=False, default_element='*'): previous = predecessors[current] assert len(previous) == 1 previous = previous[0] - if smiles_format and _write_edge_symbol(molecule, previous, current): + if _write_edge_symbol(molecule, previous, current): order = molecule.edges[previous, current].get('order', 1) smiles += order_to_symbol[order]