From 9e9179434cc11aa41fa944ce7f6cc89e2596074d Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 14 Oct 2024 13:33:19 +0200 Subject: [PATCH] implement bond order syntax for CG --- cgsmiles/graph_utils.py | 7 ++- cgsmiles/read_cgsmiles.py | 83 +++++++++++++++++-------- cgsmiles/resolve.py | 11 +++- cgsmiles/sample.py | 2 +- cgsmiles/tests/test_cgsmile_parsing.py | 34 +++++++++- cgsmiles/tests/test_molecule_resolve.py | 33 ++++++---- cgsmiles/tests/test_write_cgsmiles.py | 4 +- cgsmiles/write_cgsmiles.py | 12 +--- 8 files changed, 129 insertions(+), 57 deletions(-) diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index 500946f..64c384f 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -164,8 +164,11 @@ def set_atom_names_atomistic(molecule, meta_graph=None): fraglist = defaultdict(list) if meta_graph: for meta_node in meta_graph.nodes: - fraggraph = meta_graph.nodes[meta_node]['graph'] - fraglist[meta_node] += list(fraggraph.nodes) + # to catch virtual side nodes that do not have a representation + # in the atomsitic structure + fraggraph = meta_graph.nodes[meta_node].get('graph', None) + if fraggraph: + fraglist[meta_node] += list(fraggraph.nodes) else: node_to_fragid = nx.get_node_attributes(molecule, 'fragid') for node, fragids in node_to_fragid.items(): diff --git a/cgsmiles/read_cgsmiles.py b/cgsmiles/read_cgsmiles.py index f56180c..969080f 100644 --- a/cgsmiles/read_cgsmiles.py +++ b/cgsmiles/read_cgsmiles.py @@ -53,10 +53,6 @@ def _expand_branch(mol_graph, current, anchor, recipe): prev_node = anchor return mol_graph, current, prev_node -def _get_percent(pattern, stop): - end_num = _find_next_character(pattern, ['[', ')', '(', '}'], stop) - return pattern[stop+1:end_num] - def read_cgsmiles(pattern): """ Generate a :class:`nx.Graph` from a pattern string according to the @@ -133,6 +129,10 @@ def read_cgsmiles(pattern): cycle_edges = [] # each element in the for loop matches a pattern # '[' + '#' + some alphanumeric name + ']' + symbol_to_order = {".": 0, "=": 2, "-": 1, "#": 3, "$": 4} + default_bond_order = 1 + bond_order = None + prev_bond_order = None for match in re.finditer(PATTERNS['place_holder'], pattern): start, stop = match.span() # we start a new branch when the residue is preceded by '(' @@ -146,28 +146,54 @@ def read_cgsmiles(pattern): # here we check if the atom is followed by a cycle marker # in this case we have an open cycle and close it - for token in pattern[stop:]: - # we close a cycle - if token.isdigit() and token in cycle: - cycle_edges.append((current, cycle[token])) - del cycle[token] - # we open a cycle - elif token.isdigit(): - cycle[token] = current - # we found a ring indicator - elif token == "%": - ring_marker = _get_percent(pattern, stop) - # we close the ring + ring_marker = "" + multi_ring = False + ring_bond_order = default_bond_order + for rdx, token in enumerate(pattern[stop:]): + if multi_ring and not token.isdigit(): if ring_marker in cycle: - cycle_edges.append((current, cycle[ring_marker])) + cycle_edges.append((current, + cycle[ring_marker][0], + cycle[ring_marker][1])) del cycle[ring_marker] - break - # we open a new ring - cycle[_get_percent(pattern, stop)] = current - break + else: + cycle[ring_marker] = [current, ring_bond_order] + multi_ring = False + ring_marker = "" + ring_bond_order = default_bond_order + + # we open a new multi ring + if token == "%": + multi_ring = True + ring_marker = '%' + # we open a ring or close + elif token.isdigit(): + ring_marker += token + if not multi_ring: + # we have a single digit marker and it is in + # cycle so we close it + if ring_marker in cycle: + cycle_edges.append((current, + cycle[ring_marker][0], + cycle[ring_marker][1])) + del cycle[ring_marker] + # the marker is not in cycle so we update cycles + else: + cycle[ring_marker] = [current, ring_bond_order] + ring_marker = "" + ring_bond_order = default_bond_order + # we found bond_order + elif token in symbol_to_order: + ring_bond_order = symbol_to_order[token] else: break + # check if there is a bond-order following the node + if stop < len(pattern) and pattern[stop+rdx-1] in '- + . = # $': + bond_order = symbol_to_order[pattern[stop+rdx-1]] + else: + bond_order = default_bond_order + # here we check if the atom is followed by a expansion character '|' # as in ... [#PEO]| if stop < len(pattern) and pattern[stop] == '|': @@ -197,16 +223,19 @@ def read_cgsmiles(pattern): mol_graph.add_node(current, fragname=fragname) if prev_node is not None: - mol_graph.add_edge(prev_node, current, order=1) + mol_graph.add_edge(prev_node, current, order=prev_bond_order) + + prev_bond_order = bond_order # here we have a double edge for cycle_edge in cycle_edges: if cycle_edge in mol_graph.edges: - mol_graph.edges[cycle_edge]["order"] += 1 - else: - mol_graph.add_edge(cycle_edge[0], - cycle_edge[1], - order=1) + msg=("You define two edges between the same node." + "Use bond order symbols instead.") + raise SyntaxError(msg) + mol_graph.add_edge(cycle_edge[0], + cycle_edge[1], + order=cycle_edge[2]) prev_node = current current += 1 diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index a61df3e..3915fc1 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -238,6 +238,12 @@ def resolve_disconnected_molecule(self, fragment_dict): a dict of fragment graphs """ for meta_node in self.meta_graph.nodes: + neighbors = self.meta_graph.neighbors(meta_node) + edge_orders = [self.meta_graph.edges[(meta_node, ndx)]['order'] for ndx in neighbors] + # we are dealing with a virtual node that has no projection to the + # next resolution level + if edge_orders and all([order == 0 for order in edge_orders]): + continue fragname = self.meta_graph.nodes[meta_node]['fragname'] fragment = fragment_dict[fragname] correspondence = merge_graphs(self.molecule, fragment) @@ -278,7 +284,10 @@ def edges_from_bonding_descrpt(self, all_atom=False): if the high resolution level graph has all-atom resolution default: False """ - for prev_node, node in self.meta_graph.edges: + import random + edges = list(self.meta_graph.edges) + #random.shuffle(edges) + for prev_node, node in edges: for _ in range(0, self.meta_graph.edges[(prev_node, node)]["order"]): prev_graph = self.meta_graph.nodes[prev_node]['graph'] node_graph = self.meta_graph.nodes[node]['graph'] diff --git a/cgsmiles/sample.py b/cgsmiles/sample.py index 7dd1927..2b06de5 100644 --- a/cgsmiles/sample.py +++ b/cgsmiles/sample.py @@ -59,7 +59,7 @@ class MoleculeSampler: descriptor is used to highlight the fact that the probabilistic behaviour is driven by the bonding descriptors. - The sampler features a two level connectivity determination. First using the + The sampler features a two level connectivity determination. First using the `polymer_reactivities` an open bonding descriptor from the growing polymer molecule is selected according to the probabilities provided. Subsequently, a fragment is randomly chosen that has a matching complementary bonding diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index ccedcc0..77333c9 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -10,7 +10,7 @@ [(0, 1), (1, 2)], [1, 1]), # smiple linear sequenece with multi-edge - ("{[#PMA]1[#PEO]1}", + ("{[#PMA]=[#PEO]}", ["PMA", "PEO"], [(0, 1)], [2]), @@ -34,6 +34,36 @@ ["PMA", "PEO", "PMA"], [(0, 1), (1, 2), (0, 2)], [1, 1, 1]), + # smiple cycle sequence bond order to next + ("{[#PMA]1=[#PEO][#PMA]1}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [2, 1, 1]), + # smiple cycle sequence bond order in cycle + ("{[#PMA]=1[#PEO][#PMA]1}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [1, 1, 2]), + # smiple cycle sequence two bond orders + ("{[#PMA].1=[#PEO][#PMA]1}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [2, 1, 0]), + # smiple cycle sequence with % bond order + ("{[#PMA]=%123[#PEO][#PMA]%123}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [1, 1, 2]), + # smiple cycle sequence with % bond order next + ("{[#PMA]%123=[#PEO][#PMA]%123}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [2, 1, 1]), + # smiple cycle sequence with % two bond orders + ("{[#PMA]=%123.[#PEO][#PMA]%123}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [0, 1, 2]), # smiple cycle sequence with % ("{[#PMA]%123[#PEO][#PMA]%123}", ["PMA", "PEO", "PMA"], @@ -57,7 +87,7 @@ # [1, 1, 1, 1, 1, 1, 1, 1]), # smiple linear sequenece with multi-edge # in cycle - ("{[#PMA]12[#PMA][#PMA][#PEO]12}", + ("{[#PMA]=1[#PMA][#PMA][#PEO]1}", ["PMA", "PMA", "PMA", "PEO"], [(0, 1), (1, 2), (2, 3), (0, 3)], [1, 1, 1, 2]), diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index b907757..afdbf89 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -170,7 +170,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (2, 4), (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}), # THF like test case with double edge and squash operator - ("{[#A]1[#B]1}.{#A=[!]COC[!],#B=[!]CCCC[!]}", + ("{[#A]=[#B]}.{#A=[!]COC[!],#B=[!]CCCC[!]}", [('A', 'O C C H H H H'), ('B', 'C C H H H H C C H H H H')], 'O C C H H H H C C H H H H', @@ -185,6 +185,16 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (0, 3), (0, 4), (1, 5), (1, 7), (5, 9), (5, 6), (7, 13), (7, 8), (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}), + # simple chirality assigment with rings + ("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}", + # 0 1 2 3 + [('GLC', 'C C C C C C O O O O O O H H H H H H H H H H H H')], + 'C C C C C C O O O O O O H H H H H H H H H H H H', + [(0, 1), (0, 11), (0, 12), (0, 13), (1, 2), (1, 6), (1, 14), (2, 3), (2, 10), + (2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18), + (7, 19), (8, 20), (9, 21), (10, 22), (11, 23)], + {1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4), + 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}), # simple chirality assigment between fragments ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@H][>]C(=O)OC}", # 0 1 2 3 @@ -195,16 +205,6 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), (10, 13), (5, 14), (14, 15)], {3: (2, 10, 4, 14)}, {}), - # simple chirality assigment with rings - ("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}", - # 0 1 2 3 - [('GLC', 'C C C C C C O O O O O O H H H H H H H H H H H H')], - 'C C C C C C O O O O O O H H H H H H H H H H H H', - [(0, 1), (0, 11), (0, 12), (0, 13), (1, 2), (1, 6), (1, 14), (2, 3), (2, 10), - (2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18), - (7, 19), (8, 20), (9, 21), (10, 22), (11, 23)], - {1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4), - 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}), # simple chirality assigment between fragments inv ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@@H][>]C(=O)OC}", # 0 1 2 3 @@ -229,6 +229,16 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (1, 2), (0, 3), (0, 4), (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}), + # test skip virtual nodes + ("{[#SP4]1.2[#SP4].3[#SP1r]1.[#TC4]23}.{#SP4=OC[$]C[$]O,#SP1r=[$]OC[$]CO}", + [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), + ('SP1r', 'O C C O H H H H')], + 'O C C O H H H H O C C O H H H H O C C O H H H H', + [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6), + (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), + (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), + (18, 21), (18, 22), (19, 23)], + {},{}), )) def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez): meta_mol, molecule = MoleculeResolver.from_string(smile).resolve() @@ -240,6 +250,7 @@ def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral block_graph = meta_mol.nodes[node]['graph'] target_elements = nx.get_node_attributes(block_graph, 'element') sorted_elements = [target_elements[key] for key in sorted(target_elements)] + print(target_elements) print("-->", sorted_elements) print("-->", ref[1].split()) assert sorted_elements == ref[1].split() diff --git a/cgsmiles/tests/test_write_cgsmiles.py b/cgsmiles/tests/test_write_cgsmiles.py index 47f9a37..8bfae93 100644 --- a/cgsmiles/tests/test_write_cgsmiles.py +++ b/cgsmiles/tests/test_write_cgsmiles.py @@ -38,9 +38,9 @@ def test_write_fragments(input_string): # branched nested "{[#PE][#PMA]([#PEO][#PEO]([#OMA][#OMA]1[#OMA][#OMA]1))[#PE]}", # special cycle - "{[#PE]1[#PMA]1}", + "{[#PE]=[#PMA]}", # special triple cycle - "{[#A]12[#B]12}", + "{[#A]#[#B]}", )) def test_write_mol_graphs(input_string): mol_graph = read_cgsmiles(input_string) diff --git a/cgsmiles/write_cgsmiles.py b/cgsmiles/write_cgsmiles.py index b19f60a..34d17ce 100644 --- a/cgsmiles/write_cgsmiles.py +++ b/cgsmiles/write_cgsmiles.py @@ -86,16 +86,6 @@ def write_graph(molecule, smiles_format=False, default_element='*'): edges.add(frozenset((n_idx, n_jdx))) total_edges = set(map(frozenset, molecule.edges)) ring_edges = list(total_edges - edges) - # in cgsmiles graphs only bonds of order 1 and 2 - # exists; order 2 means we have a ring at the - # higher resolution. These orders are therefore - # represented as rings and that requires to - # add them to the ring list - if not smiles_format: - for edge in molecule.edges: - if molecule.edges[edge]['order'] != 1: - for n in range(1, molecule.edges[edge]['order']): - ring_edges.append(frozenset(edge)) atom_to_ring_idx = defaultdict(list) ring_idx_to_bond = {} @@ -123,7 +113,7 @@ def write_graph(molecule, smiles_format=False, default_element='*'): previous = predecessors[current] assert len(previous) == 1 previous = previous[0] - if smiles_format and _write_edge_symbol(molecule, previous, current): + if _write_edge_symbol(molecule, previous, current): order = molecule.edges[previous, current].get('order', 1) smiles += order_to_symbol[order]