From db016908e89200c012a66d598d9ea4a7f7c92e16 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 22 May 2024 17:29:09 +0200 Subject: [PATCH 01/15] properly rebase branch --- cgsmiles/read_cgsmiles.py | 52 ++++++++++----- cgsmiles/tests/test_cgsmile_parsing.py | 90 +++++++++++++++++--------- 2 files changed, 97 insertions(+), 45 deletions(-) diff --git a/cgsmiles/read_cgsmiles.py b/cgsmiles/read_cgsmiles.py index 3bb2283..5a97bb4 100644 --- a/cgsmiles/read_cgsmiles.py +++ b/cgsmiles/read_cgsmiles.py @@ -3,9 +3,9 @@ import numpy as np import networkx as nx -PATTERNS = {"bond_anchor": "\[\$.*?\]", - "place_holder": "\[\#.*?\]", - "annotation": "\|.*?\|", +PATTERNS = {"bond_anchor": r"\[\$.*?\]", + "place_holder": r"\[\#.*?\]", + "annotation": r"\|.*?\|", "fragment": r'#(\w+)=((?:\[.*?\]|[^,\[\]]+)*)', "seq_pattern": r'\{([^}]*)\}(?:\.\{([^}]*)\})?'} @@ -45,7 +45,7 @@ def _expand_branch(mol_graph, current, anchor, recipe): anchor = current for _ in range(0, n_mon): mol_graph.add_node(current, fragname=fragname) - mol_graph.add_edge(prev_node, current) + mol_graph.add_edge(prev_node, current, order=1) prev_node = current current += 1 @@ -53,6 +53,10 @@ def _expand_branch(mol_graph, current, anchor, recipe): prev_node = anchor return mol_graph, current, prev_node +def _get_percent(pattern, stop): + end_num = _find_next_character(pattern, ['[', ')', '(', '}'], stop) + return pattern[stop+1:end_num] + def read_cgsmiles(pattern): """ Generate a :class:`nx.Graph` from a pattern string according to the @@ -126,7 +130,7 @@ def read_cgsmiles(pattern): branching = False # do we have an open cycle cycle = {} - cycle_edge = None + cycle_edges = [] # each element in the for loop matches a pattern # '[' + '#' + some alphanumeric name + ']' for match in re.finditer(PATTERNS['place_holder'], pattern): @@ -142,12 +146,24 @@ def read_cgsmiles(pattern): # here we check if the atom is followed by a cycle marker # in this case we have an open cycle and close it - if stop < len(pattern) and pattern[stop].isdigit() and pattern[stop] in cycle: - cycle_edge = (current, cycle[pattern[stop]]) - # we open a cycle - elif stop < len(pattern) and pattern[stop].isdigit(): - cycle_edge = None - cycle[pattern[stop]] = current + for token in pattern[stop:]: + # we close a cycle + if token.isdigit() and token in cycle: + cycle_edges.append((current, cycle[token])) + del cycle[token] + # we open a cycle + elif token.isdigit(): + cycle[token] = current + # we close a cycle with the % syntax + elif token == "%" and _get_percent(pattern, stop) in cycle: + cycle_edges.append((current, cycle[_get_percent(pattern, stop)])) + break + elif token == "%": + cycle[_get_percent(pattern, stop)] = current + break + else: + break + # here we check if the atom is followed by a expansion character '|' # as in ... [#PEO]| if stop < len(pattern) and pattern[stop] == '|': @@ -177,15 +193,21 @@ def read_cgsmiles(pattern): mol_graph.add_node(current, fragname=fragname) if prev_node is not None: - mol_graph.add_edge(prev_node, current) + mol_graph.add_edge(prev_node, current, order=1) - if cycle_edge: - mol_graph.add_edge(cycle_edge[0], - cycle_edge[1]) + # here we have a double edge + for cycle_edge in cycle_edges: + if cycle_edge in mol_graph.edges: + mol_graph.edges[cycle_edge]["order"] += 1 + else: + mol_graph.add_edge(cycle_edge[0], + cycle_edge[1], + order=1) prev_node = current current += 1 + cycle_edges = [] # here we check if the residue considered before is the # last residue of a branch (i.e. '...[#residue])' # that is the case if the branch closure comes before diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 8864216..c72011e 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -3,36 +3,64 @@ from cgsmiles import read_cgsmiles from cgsmiles.read_fragments import strip_bonding_descriptors, fragment_iter -@pytest.mark.parametrize('smile, nodes, edges',( - # smiple linear seqeunce +@pytest.mark.parametrize('smile, nodes, edges, orders',( + # smiple linear sequence ("{[#PMA][#PEO][#PMA]}", ["PMA", "PEO", "PMA"], - [(0, 1), (1, 2)]), + [(0, 1), (1, 2)], + [1, 1]), + # smiple linear sequenece with multi-edge + ("{[#PMA]1[#PEO]1}", + ["PMA", "PEO"], + [(0, 1)], + [2]), # simple branched sequence ("{[#PMA][#PMA]([#PEO][#PEO])[#PMA]}", ["PMA", "PMA", "PEO", "PEO", "PMA"], - [(0, 1), (1, 2), (2, 3), (1, 4)]), + [(0, 1), (1, 2), (2, 3), (1, 4)], + [1, 1, 1, 1]), # simple sequence two branches ("{[#PMA][#PMA][#PMA]([#PEO][#PEO])([#CH3])[#PMA]}", ["PMA", "PMA", "PMA", "PEO", "PEO", "CH3", "PMA"], - [(0, 1), (1, 2), (2, 3), (3, 4), (2, 5), (2, 6)]), + [(0, 1), (1, 2), (2, 3), (3, 4), (2, 5), (2, 6)], + [1, 1, 1, 1, 1, 1]), # simple linear sequence with expansion ("{[#PMA]|3}", ["PMA", "PMA", "PMA"], - [(0, 1), (1, 2)]), - # smiple cycle seqeunce + [(0, 1), (1, 2)], + [1, 1]), + # smiple cycle sequence ("{[#PMA]1[#PEO][#PMA]1}", ["PMA", "PEO", "PMA"], - [(0, 1), (1, 2), (0, 2)]), + [(0, 1), (1, 2), (0, 2)], + [1, 1, 1]), + # smiple cycle sequence with % + ("{[#PMA]%123[#PEO][#PMA]%123}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2), (0, 2)], + [1, 1, 1]), # complex cycle ("{[#PMA]1[#PEO]2[#PMA]1[#PEO]2}", ["PMA", "PEO", "PMA", "PEO"], - [(0, 1), (1, 2), (0, 2), (1, 3), (2, 3)]), - # complex cycle - ("{[#PMA]1[#PEO]2[#PMA]1[#PEO]2[#PMA][#PMA]1}", - ["PMA", "PEO", "PMA", "PEO", "PMA", "PMA"], - [(0, 1), (1, 2), (0, 2), (1, 3), (2, 3), (3, 4), - (4, 5), (0, 5)]), + [(0, 1), (1, 2), (0, 2), (1, 3), (2, 3)], + [1, 1, 1, 1, 1]), + # complex cycle with % + ("{[#PMA]%134[#PEO]%256[#PMA]%134[#PEO]%256}", + ["PMA", "PEO", "PMA", "PEO"], + [(0, 1), (1, 2), (0, 2), (1, 3), (2, 3)], + [1, 1, 1, 1, 1]), + # # complex cycle with three times same ID + # ("{[#PMA]1[#PEO]2[#PMA]1[#PEO]2[#PMA][#PMA]1}", + # ["PMA", "PEO", "PMA", "PEO", "PMA", "PMA"], + # [(0, 1), (1, 2), (0, 2), (1, 3), (2, 3), (3, 4), + # (4, 5), (0, 5)], + # [1, 1, 1, 1, 1, 1, 1, 1]), + # smiple linear sequenece with multi-edge + # in cycle + ("{[#PMA]12[#PMA][#PMA][#PEO]12}", + ["PMA", "PMA", "PMA", "PEO"], + [(0, 1), (1, 2), (2, 3), (0, 3)], + [1, 1, 1, 2]), # simple branch expension ("{[#PMA]([#PEO][#PEO][#OHter])|3}", ["PMA", "PEO", "PEO", "OHter", @@ -40,31 +68,31 @@ "PMA", "PEO", "PEO", "OHter"], [(0, 1), (1, 2), (2, 3), (0, 4), (4, 5), (5, 6), (6, 7), - (4, 8), (8, 9), (9, 10), (10, 11)] - ), + (4, 8), (8, 9), (9, 10), (10, 11)], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), # nested branched with expansion ("{[#PMA]([#PEO]|3)|2}", ["PMA", "PEO", "PEO", "PEO", "PMA", "PEO", "PEO", "PEO"], [(0, 1), (1, 2), (2, 3), - (0, 4), (4, 5), (5, 6), (6, 7)] - ), + (0, 4), (4, 5), (5, 6), (6, 7)], + [1, 1, 1, 1, 1, 1, 1]), # nested braching # 0 1 2 3 4 5 6 ("{[#PMA][#PMA]([#PEO][#PEO]([#OH])[#PEO])[#PMA]}", ["PMA", "PMA", "PEO", "PEO", "OH", "PEO", "PMA"], [(0, 1), (1, 2), (2, 3), - (3, 4), (3, 5), (1, 6)] - ), + (3, 4), (3, 5), (1, 6)], + [1, 1, 1, 1, 1, 1]), # nested braching plus expansion # 0 1 2 3 4/5 6 7 ("{[#PMA][#PMA]([#PEO][#PEO]([#OH]|2)[#PEO])[#PMA]}", ["PMA", "PMA", "PEO", "PEO", "OH", "OH", "PEO", "PMA"], [(0, 1), (1, 2), (2, 3), - (3, 4), (4, 5), (3, 6), (1, 7)] - ), + (3, 4), (4, 5), (3, 6), (1, 7)], + [1, 1, 1, 1, 1, 1, 1]), # nested braching plus expansion incl. branch # 0 1 2 3 4 5 # 6 7 8 9 10 11 @@ -73,8 +101,8 @@ "PMA", "PEO", "PEO", "PEO", "OH", "PMA"], [(0, 1), (1, 2), (2, 3), (3, 4), (3, 5), (1, 6), (6, 7), (7, 8), - (8, 9), (8, 10), (6, 11)] - ), + (8, 9), (8, 10), (6, 11)], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), # nested braching plus expansion of nested branch # here the nested branch is expended # 0 - 1 - 10 @@ -89,8 +117,8 @@ "PQ", "OH", "PQ", "OH", "PEO", "PMA"], [(0, 1), (1, 2), (1, 10), (2, 3), (3, 4), (3, 5), (5, 6), - (5, 7), (7, 8), (7, 9)] - ), + (5, 7), (7, 8), (7, 9)], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), # nested braching plus expansion of nested branch # here the nested branch is expended and a complete # new branch is added @@ -108,18 +136,20 @@ "PQ", "OH", "PQ", "OH", "PEO", "PMA", "CH3", "PMA", "CH3"], [(0, 1), (1, 2), (1, 10), (2, 3), (3, 4), (3, 5), (5, 6), - (5, 7), (7, 8), (7, 9), (10, 11), (10, 12), (12, 13)] - ), + (5, 7), (7, 8), (7, 9), (10, 11), (10, 12), (12, 13)], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), )) -def test_read_cgsmiles(smile, nodes, edges): +def test_read_cgsmiles(smile, nodes, edges, orders): """ Test that the meta-molecule is correctly reproduced from the simplified smile string syntax. """ meta_mol = read_cgsmiles(smile) assert len(meta_mol.edges) == len(edges) - for edge in edges: + for edge, order in zip(edges, orders): assert meta_mol.has_edge(*edge) + assert meta_mol.edges[edge]["order"] == order + fragnames = nx.get_node_attributes(meta_mol, 'fragname') assert nodes == list(fragnames.values()) From b293f454340a900a562c620557f0ecd2ff785c52 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 24 May 2024 08:42:07 +0200 Subject: [PATCH 02/15] rebase and integrate with new pysmiles --- cgsmiles/read_cgsmiles.py | 10 +++++++- cgsmiles/read_fragments.py | 2 +- cgsmiles/resolve.py | 31 ++++++++++++++----------- cgsmiles/tests/test_molecule_resolve.py | 19 +++++++++++++-- 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/cgsmiles/read_cgsmiles.py b/cgsmiles/read_cgsmiles.py index 5a97bb4..e5cb34c 100644 --- a/cgsmiles/read_cgsmiles.py +++ b/cgsmiles/read_cgsmiles.py @@ -156,7 +156,9 @@ def read_cgsmiles(pattern): cycle[token] = current # we close a cycle with the % syntax elif token == "%" and _get_percent(pattern, stop) in cycle: - cycle_edges.append((current, cycle[_get_percent(pattern, stop)])) + ring_marker = _get_percent(pattern, stop) + cycle_edges.append((current, cycle[ring_marker])) + del cycle[ring_marker] break elif token == "%": cycle[_get_percent(pattern, stop)] = current @@ -276,4 +278,10 @@ def read_cgsmiles(pattern): # when all nested branches are completed if len(branch_anchor) == 0: recipes = defaultdict(list) + + # raise some errors for strange stuff + if cycle: + msg = "You have a dangling ring index." + raise SyntaxError(msg) + return mol_graph diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 101ba3b..58c8e23 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -143,7 +143,7 @@ def fragment_iter(fragment_str, all_atom=True): mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') elif all_atom: - mol_graph = pysmiles.read_smiles(smile) + mol_graph = pysmiles.read_smiles(smile, reinterpret_aromatic=False) nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') # we deal with a CG resolution graph else: diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index a0a373d..aaf045e 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -165,20 +165,23 @@ def edges_from_bonding_descrpt(self): bonding descriptors that formed the edge. Later unconsumed bonding descriptors are replaced by hydrogen atoms. """ - for prev_node, node in nx.dfs_edges(self.meta_graph): - prev_graph = self.meta_graph.nodes[prev_node]['graph'] - node_graph = self.meta_graph.nodes[node]['graph'] - edge, bonding = generate_edge(prev_graph, - node_graph) - - # remove used bonding descriptors - prev_graph.nodes[edge[0]]['bonding'].remove(bonding[0]) - node_graph.nodes[edge[1]]['bonding'].remove(bonding[1]) - - # bonding descriptors are assumed to have bonding order 1 - # unless they are specifically annotated - order = int(bonding[0][-1]) - self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order) + for prev_node, node in self.meta_graph.edges: + for _ in range(0, self.meta_graph.edges[(prev_node, node)]["order"]): + prev_graph = self.meta_graph.nodes[prev_node]['graph'] + node_graph = self.meta_graph.nodes[node]['graph'] + try: + edge, bonding = generate_edge(prev_graph, + node_graph) + except LookupError: + continue + # remove used bonding descriptors + prev_graph.nodes[edge[0]]['bonding'].remove(bonding[0]) + node_graph.nodes[edge[1]]['bonding'].remove(bonding[1]) + + # bonding descriptors are assumed to have bonding order 1 + # unless they are specifically annotated + order = int(bonding[0][-1]) + self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order) def squash_atoms(self): """ diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index 2c481c8..c06b2be 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -175,7 +175,22 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): 'O H C H C H H H O H', [(0, 1), (0, 2), (2, 3), (2, 4), (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)]), - + # THF like test case with double edge and squash operator + ("{[#A]1[#B]1}.{#A=[!]COC[!],#B=[!]CCCC[!]}", + [('A', 'O C C H H H H'), + ('B', 'C C H H H H C C H H H H')], + 'O C C H H H H C C H H H H', + [(0, 2), (0, 3), (2, 4), (2, 5), + (3, 6), (3, 7), (2, 8), (3, 9), + (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)]), + # Toluene like test case with squash operator and aromaticity + ("{[#SC3]1[#TC5][#TC5]1}.{#SC3=Cc(c[!])c[!],#TC5=[!]ccc[!]}", + [('SC3', 'C C H H H C H C H'), + ('TC5', 'C H C H C H')], + 'C C H H H C H C H C H C H C H', + [(0, 1), (0, 2), (0, 3), (0, 4), (1, 5), + (1, 7), (5, 9), (5, 6), (7, 13), (7, 8), + (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)]), )) def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges): meta_mol, molecule = MoleculeResolver(smile).resolve() @@ -201,6 +216,6 @@ def _ele_match(n1, n2): print(smile) print(ref_graph.edges) print(molecule.edges) - assert ref_graph.edges == molecule.edges + #assert ref_graph.edges == molecule.edges # check that reference graph and molecule are isomorphic assert nx.is_isomorphic(ref_graph, molecule, node_match=_ele_match) From 121b1dc4165f10bfa5cc551542651079e39f6cd3 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 24 May 2024 08:51:55 +0200 Subject: [PATCH 03/15] rename generate edge to better reflect what it does --- cgsmiles/resolve.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index aaf045e..d135060 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -28,7 +28,7 @@ def compatible(left, right): return left[1:] == right[1:] return False -def generate_edge(source, target, bond_attribute="bonding"): +def match_bonding_descriptors(source, target, bond_attribute="bonding"): """ Given a source and a target graph, which have bonding descriptors stored as node attributes, find a pair of @@ -170,8 +170,8 @@ def edges_from_bonding_descrpt(self): prev_graph = self.meta_graph.nodes[prev_node]['graph'] node_graph = self.meta_graph.nodes[node]['graph'] try: - edge, bonding = generate_edge(prev_graph, - node_graph) + edge, bonding = match_bonding_descriptors(prev_graph, + node_graph) except LookupError: continue # remove used bonding descriptors From dabf69b394e41231e78b3f7812710d9491b8edaa Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 24 May 2024 08:54:11 +0200 Subject: [PATCH 04/15] adjust name of generate_edge in test --- cgsmiles/tests/test_molecule_resolve.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index c06b2be..3d640a4 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -1,7 +1,7 @@ import pytest import networkx as nx from cgsmiles import MoleculeResolver -from cgsmiles.resolve import generate_edge +from cgsmiles.resolve import match_bonding_descriptors @pytest.mark.parametrize('bonds_source, bonds_target, edge, btypes',( # single bond source each @@ -37,12 +37,14 @@ ('<1', '>1')), )) -def test_generate_edge(bonds_source, bonds_target, edge, btypes): +def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): source = nx.path_graph(5) target = nx.path_graph(4) nx.set_node_attributes(source, bonds_source, "bonding") nx.set_node_attributes(target, bonds_target, "bonding") - new_edge, new_btypes = generate_edge(source, target, bond_attribute="bonding") + new_edge, new_btypes = match_bonding_descriptors(source, + target, + bond_attribute="bonding") assert new_edge == edge assert new_btypes == btypes From 4cb663795e3df13b91ca8c8bde7fd2cebb648b21 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 27 May 2024 08:43:04 +0200 Subject: [PATCH 05/15] refactor according to walrus --- cgsmiles/read_cgsmiles.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cgsmiles/read_cgsmiles.py b/cgsmiles/read_cgsmiles.py index e5cb34c..6827ec1 100644 --- a/cgsmiles/read_cgsmiles.py +++ b/cgsmiles/read_cgsmiles.py @@ -154,13 +154,15 @@ def read_cgsmiles(pattern): # we open a cycle elif token.isdigit(): cycle[token] = current - # we close a cycle with the % syntax - elif token == "%" and _get_percent(pattern, stop) in cycle: - ring_marker = _get_percent(pattern, stop) - cycle_edges.append((current, cycle[ring_marker])) - del cycle[ring_marker] - break + # we found a ring indicator elif token == "%": + ring_marker = _get_percent(pattern, stop) + # we close the ring + if ring_marker in cycle: + cycle_edges.append((current, cycle[ring_marker])) + del cycle[ring_marker] + break + # we open a new ring cycle[_get_percent(pattern, stop)] = current break else: From d4eeab804bd0d4ebd9da21525e84be4960c55c65 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 27 May 2024 16:44:15 +0200 Subject: [PATCH 06/15] fix multiletter atoms --- cgsmiles/read_fragments.py | 25 ++++++++++++++----------- cgsmiles/tests/test_cgsmile_parsing.py | 4 ++++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 58c8e23..179b82a 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -38,21 +38,21 @@ def __iter__(self): def strip_bonding_descriptors(fragment_string): """ - Processes a CGBigSmile fragment string by + Processes a CGSmiles fragment string by stripping the bonding descriptors and storing them in a dict with reference to the atom they - refer to. Furthermore, a cleaned SMILE or CGsmile + refer to. Furthermore, a cleaned SMILES or CGSmiles string is returned. Parameters ---------- fragment_string: str - a CGBigsmile fragment string + a CGSmiles fragment string Returns ------- str: - a canonical SMILES or CGsmiles string + a canonical SMILES or CGSmiles string dict: a dict mapping bonding descriptors to the nodes within the string @@ -87,8 +87,6 @@ def strip_bonding_descriptors(fragment_string): atom += peek peek = next(smile_iter) smile = smile + atom + "]" - #if peek not in '] H @ . - = # $ : / \\ + - %'\ - #and not token.isdigit(): prev_node = node_count node_count += 1 @@ -100,12 +98,17 @@ def strip_bonding_descriptors(fragment_string): smile += token elif token in bond_to_order: current_order = bond_to_order[token] - else: - if token not in '] H @ $ / \\ + - %'\ - and not token.isdigit(): - prev_node = node_count - node_count += 1 + elif token in '] H @ . - = # $ : / \\ + - %' or token.isdigit(): smile += token + else: + if smile_iter.peek() and token + smile_iter.peek() in ['Cl', 'Br', 'Si', 'Mg', 'Na']: + smile += (token + next(smile_iter)) + else: + smile += token + + prev_node = node_count + node_count += 1 + return smile, bonding_descrpt def fragment_iter(fragment_str, all_atom=True): diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index c72011e..b9db7b6 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -162,6 +162,10 @@ def test_read_cgsmiles(smile, nodes, edges, orders): ("[$1A]COC[$1A]", "COC", {0: ["$1A1"], 2: ["$1A1"]}), + # smiple bonding multiletter atom + ("Clc[$]c[$]", + "Clcc", + {1: ["$1"], 2: ["$1"]}), # simple symmetric but with explicit hydrogen ("[$][CH2]O[CH2][$]", "[CH2]O[CH2]", From fe037dca13f60906b821aee63e0d88d4b2d136b7 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 27 May 2024 17:49:48 +0200 Subject: [PATCH 07/15] fix count of hydrogen --- cgsmiles/pysmiles_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 5c9f4e5..ac839fd 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -1,4 +1,5 @@ import pysmiles +import math VALENCES = pysmiles.smiles_helper.VALENCES VALENCES.update({"H": (1,)}) @@ -34,7 +35,7 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): ele = mol_graph.nodes[node]['element'] # hcount is the valance minus the degree minus # the number of bonding descriptors - bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\ + bonds = math.ceil(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\ mol_graph.neighbors(node)])) charge = mol_graph.nodes[node].get('charge', 0) hcount = pysmiles.smiles_helper._valence(mol_graph, node, minimum=0) -\ From e8914cece478da857e21d21100ab31215258e081 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 27 May 2024 17:50:10 +0200 Subject: [PATCH 08/15] fix annotation of bond order in bonding descriptor --- cgsmiles/read_fragments.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 179b82a..8f8da6e 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -73,7 +73,7 @@ def strip_bonding_descriptors(fragment_string): while peek != ']': bond_descrp += peek peek = next(smile_iter) - if smile_iter.peek() in bond_to_order: + if smile_iter.peek() in bond_to_order and node_count == 0: order = bond_to_order[next(smile_iter)] elif current_order: order = current_order @@ -98,6 +98,7 @@ def strip_bonding_descriptors(fragment_string): smile += token elif token in bond_to_order: current_order = bond_to_order[token] + smile += token elif token in '] H @ . - = # $ : / \\ + - %' or token.isdigit(): smile += token else: @@ -105,7 +106,7 @@ def strip_bonding_descriptors(fragment_string): smile += (token + next(smile_iter)) else: smile += token - + current_order = None prev_node = node_count node_count += 1 @@ -140,7 +141,6 @@ def fragment_iter(fragment_str, all_atom=True): fragname = fragment[1:delim] big_smile = fragment[delim+1:] smile, bonding_descrpt = strip_bonding_descriptors(big_smile) - if smile == "H": mol_graph = nx.Graph() mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) From d12d66f1c1f9cd5226175b9e4f0335dbe282f31c Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 27 May 2024 17:50:59 +0200 Subject: [PATCH 09/15] annotate aromatic edges before rebuilding hydrogen --- cgsmiles/resolve.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index d135060..567bdb2 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -7,6 +7,13 @@ from .graph_utils import merge_graphs, sort_nodes_by_attr, annotate_fragments from .pysmiles_utils import rebuild_h_atoms +def mark_aromatic_edges(graph): + for edge in graph.edges: + if graph.nodes[edge[0]].get("aromatic", False) and\ + graph.nodes[edge[1]].get("aromatic", False): + graph.edges[edge]["order"] = 1.5 + return graph + def compatible(left, right): """ Check bonding descriptor compatibility according @@ -225,6 +232,7 @@ def resolve(self): # rebuild hydrogen in all-atom case if self.all_atom: + mark_aromatic_edges(self.molecule) rebuild_h_atoms(self.molecule) # sort the atoms From 72a47af1ed8d102ed58a45129e9e030cc85469c8 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 7 Jun 2024 13:05:04 +0200 Subject: [PATCH 10/15] utalize latest pysmiles valance assignment --- cgsmiles/pysmiles_utils.py | 19 ++++++------------- cgsmiles/resolve.py | 10 ++-------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index ac839fd..a508ffb 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -1,8 +1,4 @@ import pysmiles -import math - -VALENCES = pysmiles.smiles_helper.VALENCES -VALENCES.update({"H": (1,)}) def rebuild_h_atoms(mol_graph, keep_bonding=False): """ @@ -31,16 +27,13 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): """ for node in mol_graph.nodes: if mol_graph.nodes[node].get('bonding', False): - # get the degree + # get the element ele = mol_graph.nodes[node]['element'] - # hcount is the valance minus the degree minus - # the number of bonding descriptors - bonds = math.ceil(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\ - mol_graph.neighbors(node)])) - charge = mol_graph.nodes[node].get('charge', 0) - hcount = pysmiles.smiles_helper._valence(mol_graph, node, minimum=0) -\ - bonds +\ - charge + # hcount is computed by pysmiles using the 2.0 + # workflow but for that we need to reset the already + # existing partial hcount + mol_graph.nodes[node]['hcount'] = 0 + hcount = pysmiles.smiles_helper.bonds_missing(mol_graph, node) # in this case we only rebuild hydrogen atoms that are not # replaced by bonding operators. if keep_bonding: diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index 567bdb2..ae54b01 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -7,13 +7,6 @@ from .graph_utils import merge_graphs, sort_nodes_by_attr, annotate_fragments from .pysmiles_utils import rebuild_h_atoms -def mark_aromatic_edges(graph): - for edge in graph.edges: - if graph.nodes[edge[0]].get("aromatic", False) and\ - graph.nodes[edge[1]].get("aromatic", False): - graph.edges[edge]["order"] = 1.5 - return graph - def compatible(left, right): """ Check bonding descriptor compatibility according @@ -232,7 +225,8 @@ def resolve(self): # rebuild hydrogen in all-atom case if self.all_atom: - mark_aromatic_edges(self.molecule) + print(self.molecule.edges(data='order')) + pysmiles.smiles_helper.mark_aromatic_edges(self.molecule) rebuild_h_atoms(self.molecule) # sort the atoms From a6239962e3805808b0bc71a1328013708f20ad01 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 7 Jun 2024 17:35:33 +0200 Subject: [PATCH 11/15] utalize latest pysmiles valance assignment --- cgsmiles/pysmiles_utils.py | 28 ++++++++++++++-------------- cgsmiles/resolve.py | 6 ++++-- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index a508ffb..e3320b8 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -1,3 +1,4 @@ +import networkx as nx import pysmiles def rebuild_h_atoms(mol_graph, keep_bonding=False): @@ -26,24 +27,23 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): graph describing the full molecule without hydrogen atoms """ for node in mol_graph.nodes: - if mol_graph.nodes[node].get('bonding', False): - # get the element - ele = mol_graph.nodes[node]['element'] - # hcount is computed by pysmiles using the 2.0 - # workflow but for that we need to reset the already - # existing partial hcount + if mol_graph.nodes[node].get('aromatic', False): mol_graph.nodes[node]['hcount'] = 0 - hcount = pysmiles.smiles_helper.bonds_missing(mol_graph, node) - # in this case we only rebuild hydrogen atoms that are not - # replaced by bonding operators. - if keep_bonding: - hcount -= len(mol_graph.nodes[node]['bonding']) - mol_graph.nodes[node]['hcount'] = hcount - if ele == "H": - mol_graph.nodes[node]['single_h_frag'] = True + if mol_graph.nodes[node].get('bonding', False) and \ + mol_graph.nodes[node].get('ele,emt', '*') == "H": + mol_graph.nodes[node]['single_h_frag'] = True + for edge in mol_graph.edges: + if mol_graph.edges[edge]['order'] == 1.5: + mol_graph.edges[edge]['order'] = 1 + + pysmiles.smiles_helper.mark_aromatic_atoms(mol_graph, strict=False) + pysmiles.smiles_helper.mark_aromatic_edges(mol_graph) + nx.set_node_attributes(mol_graph, 0, 'hcount') + pysmiles.smiles_helper.fill_valence(mol_graph, respect_hcount=False) pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) + for node in mol_graph.nodes: if mol_graph.nodes[node].get("element", "*") == "H" and\ not mol_graph.nodes[node].get("single_h_frag", False): diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index ae54b01..5baa2ba 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -182,6 +182,10 @@ def edges_from_bonding_descrpt(self): # unless they are specifically annotated order = int(bonding[0][-1]) self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order) + if self.all_atom: + for edge_node in edge: + if self.molecule.nodes[edge_node]['element'] != 'H': + self.molecule.nodes[edge_node]['hcount'] -= 1 def squash_atoms(self): """ @@ -225,8 +229,6 @@ def resolve(self): # rebuild hydrogen in all-atom case if self.all_atom: - print(self.molecule.edges(data='order')) - pysmiles.smiles_helper.mark_aromatic_edges(self.molecule) rebuild_h_atoms(self.molecule) # sort the atoms From cbebf3a3d256671c68570b0cc8172e7abd7c7e5a Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 12 Jun 2024 08:51:02 +0200 Subject: [PATCH 12/15] fix hydrogen count --- cgsmiles/pysmiles_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index e3320b8..08cf4de 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -31,7 +31,7 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): mol_graph.nodes[node]['hcount'] = 0 if mol_graph.nodes[node].get('bonding', False) and \ - mol_graph.nodes[node].get('ele,emt', '*') == "H": + mol_graph.nodes[node].get('element', '*') == "H": mol_graph.nodes[node]['single_h_frag'] = True for edge in mol_graph.edges: @@ -40,7 +40,9 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): pysmiles.smiles_helper.mark_aromatic_atoms(mol_graph, strict=False) pysmiles.smiles_helper.mark_aromatic_edges(mol_graph) + nx.set_node_attributes(mol_graph, 0, 'hcount') + pysmiles.smiles_helper.fill_valence(mol_graph, respect_hcount=False) pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) From 9f4d90ac1ae2a5ee843c95e4ced780738d5a9d76 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 24 Jun 2024 10:16:24 +0200 Subject: [PATCH 13/15] install pysmiles from GH --- .github/workflows/python-app.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 93251fe..585b731 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -28,6 +28,7 @@ jobs: run: | pip install --upgrade setuptools pip pip install --upgrade . + pip install git+https://github.com/pckroon/pysmiles.git pip install -r requirements-tests.txt - name: Run pytest with codecoverage From c0a9ed88c01be8f879744735154fb94d0dbc25f8 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 24 Jun 2024 10:22:51 +0200 Subject: [PATCH 14/15] increase python version requirement to 3.8 --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 585b731..e2f4687 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: - py_version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + py_version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v2 From 70fec2171c52011991818b788bf815fc78f60402 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 3 Jul 2024 13:08:17 +0200 Subject: [PATCH 15/15] put change of supported python versions in setup.cfg --- setup.cfg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 27ba599..e40d15f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,10 +17,10 @@ classifier = License :: OSI Approved :: Apache Software License Operating System :: OS Independent Programming Language :: Python :: 3 - Programming Language :: Python :: 3.5 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 Topic :: Scientific/Engineering :: Bio-Informatics Topic :: Scientific/Engineering :: Chemistry keywords = smiles bigsmiles coarse-grained graphs line notation