From 0c4da28636cdb60863b105362c25239debce8d84 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 12 Apr 2024 18:20:43 +0200 Subject: [PATCH 01/19] implement squash operator --- cgsmiles/resolve.py | 50 ++++++++++++++++++++++++- cgsmiles/tests/test_molecule_resolve.py | 12 +++++- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index fedc3c0..c1d1daa 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -141,7 +141,7 @@ def resolve_disconnected_molecule(self): new_node = correspondence[node] attrs = self.molecule.nodes[new_node] graph_frag.add_node(correspondence[node], **attrs) - nx.set_node_attributes(graph_frag, meta_node, 'fragid') + nx.set_node_attributes(graph_frag, [meta_node], 'fragid') for a, b in fragment.edges: new_a = correspondence[a] @@ -151,7 +151,6 @@ def resolve_disconnected_molecule(self): self.meta_graph.nodes[meta_node]['graph'] = graph_frag - def edges_from_bonding_descrpt(self): """ Make edges according to the bonding descriptors stored @@ -185,6 +184,52 @@ def edges_from_bonding_descrpt(self): order = 1 self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order) + def squash_atoms(self): + """ + Applies the squash operator. + """ + bondings = nx.get_edge_attributes(self.molecule, 'bonding') + squashed = False + for edge, bonding in bondings.items(): + print(edge) + # we have a squash operator + if bonding[0].startswith('!'): + # find all hydrogens to remove + # and which atoms to connect + nodes_to_remove = [edge[1]] + new_edge_nodes = [] + for hnode in self.molecule.neighbors(edge[1]): + if self.molecule.nodes[hnode].get('element', 'Nan') == 'H': + nodes_to_remove.append(hnode) + elif hnode != edge[0]: + new_edge_nodes.append(hnode) + + # remove edges + self.molecule.remove_edge(*edge) + for node in nodes_to_remove[1:]: + self.molecule.remove_edge(edge[1], node) + + # add edges + for node in new_edge_nodes: + self.molecule.add_edge(edge[0], node) + + # find the reference hydrogen atoms + nodes_to_keep = [edge[0]] + for hnode in self.molecule.neighbors(edge[0]): + if self.molecule.nodes[hnode].get('element', 'Nan') == 'H': + nodes_to_keep.append(hnode) + + # remove squashed node and hydrogen atoms + for ref_node, node in zip(nodes_to_keep, nodes_to_remove): + other_fragid = self.molecule.nodes[node]['fragid'] + self.molecule.remove_node(node) + self.molecule.nodes[ref_node]['fragid'] += other_fragid + squashed = True + + if squashed: + self.meta_graph = annotate_fragments(self.meta_graph, + self.molecule) + def replace_unconsumed_bonding_descrpt(self): """ We allow multiple bonding descriptors per atom, which @@ -224,6 +269,7 @@ def resolve(self): self.resolve_disconnected_molecule() self.edges_from_bonding_descrpt() + self.squash_atoms() if self.all_atom: self.replace_unconsumed_bonding_descrpt() diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index 9a7ff66..930c459 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -107,10 +107,17 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): # ('PE', 'C C H H H H H'), ('PE', 'C C H H H H'), ('PE', 'C C H H H'), # ('PE', 'C C H H H H'), ('PE', 'C C H H H H H'), ('PE', 'C C H H H H H')] # [, -# ( +# ( + # smiple squash operator; no unconsumed operators + ("{[#A][#B]}.{#A=OC[!],#B=[!]CC}", + # 0 1 2 3 4 1 5 3 4 6 7 8 + # 0 1 2 3 4 5 6 7 8 9 10 11 + [('A', 'O C H H H'), ('B', 'C C H H H H H'),], + [(0, 1), (0, 2), (1, 3), (1, 4), (1, 6), (6, 9), (6, 10), + (6, 11),]), )) -def test_def_big_smile_parser(smile, ref_nodes, ref_edges): +def test_resolve_molecule(smile, ref_nodes, ref_edges): meta_mol, molecule = MoleculeResolver(smile).resolve() # nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) # plt.show() @@ -120,4 +127,5 @@ def test_def_big_smile_parser(smile, ref_nodes, ref_edges): elements = nx.get_node_attributes(block_graph, 'element') #.values()) sorted_elements = [elements[key] for key in sorted(elements)] assert sorted_elements == ref[1].split() + print(molecule.edges) assert sorted(molecule.edges) == sorted(ref_edges) From 4037b8ef83312892085c1b03e2e6a3beb35d43c5 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 12 Apr 2024 18:21:19 +0200 Subject: [PATCH 02/19] make fragid a list to allow a single atom to belong to more than one fragment --- cgsmiles/graph_utils.py | 18 ++++++++++-------- cgsmiles/read_fragments.py | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index b8d5038..56a4ee6 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -37,13 +37,13 @@ def merge_graphs(source_graph, target_graph, max_node=None): # We assume that the last id is always the largest. last_node_idx = max_node offset = last_node_idx - fragment_offset = source_graph.nodes[last_node_idx].get('fragid', 0) + fragment_offset = max(source_graph.nodes[last_node_idx].get('fragid', [0])) correspondence = {} for idx, node in enumerate(target_graph.nodes(), start=offset + 1): correspondence[node] = idx new_atom = copy.copy(target_graph.nodes[node]) - new_atom['fragid'] = (new_atom.get('fragid', 0) + fragment_offset) + new_atom['fragid'] = [(new_atom.get('fragid', 0) + fragment_offset)] source_graph.add_node(idx, **new_atom) for node1, node2 in target_graph.edges: @@ -95,23 +95,25 @@ def annotate_fragments(meta_graph, molecule): """ node_to_fragids = nx.get_node_attributes(molecule, 'fragid') - fragids = defaultdict(list) - for fragid, node in node_to_fragids.items(): - fragids[fragid].append(node) + fragid_to_node = defaultdict(list) + for node, fragids in node_to_fragids.items(): + print(fragids) + for fragid in fragids: + fragid_to_node[fragid].append(node) for meta_node in meta_graph.nodes: # adding node to the fragment graph graph_frag = nx.Graph() - for node in fragids[meta_node]: + for node in fragid_to_node[meta_node]: attrs = molecule.nodes[node] graph_frag.add_node(node, **attrs) # adding the edges # this is slow but OK; we always assume that the fragment # is much much smaller than the fullblown graph - combinations = itertools.combinations(fragids[meta_node], r=2) + combinations = itertools.combinations(fragid_to_node[meta_node], r=2) for a, b in combinations: if molecule.has_edge(a, b): - graph.add_edge(a, b) + graph_frag.add_edge(a, b) return meta_graph diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index d8740b0..a8c8957 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -35,7 +35,7 @@ def strip_bonding_descriptors(fragment_string): for token in smile_iter: if token == '[': peek = next(smile_iter) - if peek in ['$', '>', '<']: + if peek in ['$', '>', '<', '!']: bond_descrp = peek peek = next(smile_iter) while peek != ']': From 5973409c033e810b6e57fc637e6cc6d115b33e78 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 14 Apr 2024 14:18:45 +0200 Subject: [PATCH 03/19] implement squash operator --- cgsmiles/graph_utils.py | 1 - cgsmiles/resolve.py | 6 +++--- cgsmiles/tests/test_molecule_resolve.py | 15 ++++++++++++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index 56a4ee6..88cd126 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -97,7 +97,6 @@ def annotate_fragments(meta_graph, molecule): fragid_to_node = defaultdict(list) for node, fragids in node_to_fragids.items(): - print(fragids) for fragid in fragids: fragid_to_node[fragid].append(node) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index c1d1daa..27a8bbe 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -191,7 +191,6 @@ def squash_atoms(self): bondings = nx.get_edge_attributes(self.molecule, 'bonding') squashed = False for edge, bonding in bondings.items(): - print(edge) # we have a squash operator if bonding[0].startswith('!'): # find all hydrogens to remove @@ -248,12 +247,13 @@ def replace_unconsumed_bonding_descrpt(self): attrs = {attr: graph.nodes[node][attr] for attr in ['fragname', 'fragid']} attrs['element'] = 'H' for _ in range(0, hcount): - new_node = len(self.molecule.nodes) + 1 + new_node = len(self.molecule.nodes) #+ 1 graph.add_edge(node, new_node) attrs['atomname'] = "H" + str(len(graph.nodes)-1) graph.nodes[new_node].update(attrs) self.molecule.add_edge(node, new_node, order=1) self.molecule.nodes[new_node].update(attrs) + # now we want to sort the atoms sort_nodes(self.molecule) # and redo the meta molecule @@ -269,8 +269,8 @@ def resolve(self): self.resolve_disconnected_molecule() self.edges_from_bonding_descrpt() - self.squash_atoms() if self.all_atom: self.replace_unconsumed_bonding_descrpt() + self.squash_atoms() return self.meta_graph, self.molecule diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index 930c459..e8054d7 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -78,8 +78,8 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): [('OHter', 'O H'), ('PEO', 'C O C H H H H'), ('PEO', 'C O C H H H H'), ('OHter', 'O H')], [(0, 1), (0, 2), (2, 3), (2, 5), (2, 10), (3, 4), - (4, 6), (4, 7), (4, 17), (8, 9), (8, 11), (8, 14), - (8, 18), (9, 10), (10, 12), (10, 13), (14, 15)]), + (4, 6), (4, 7), (4, 16), (8, 9), (8, 11), (8, 14), + (8, 17), (9, 10), (10, 12), (10, 13), (14, 15)]), # simple branched sequence ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), @@ -115,7 +115,16 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): [('A', 'O C H H H'), ('B', 'C C H H H H H'),], [(0, 1), (0, 2), (1, 3), (1, 4), (1, 6), (6, 9), (6, 10), (6, 11),]), - + # smiple squash operator; unconsumed operators + ("{[#A][#B]}.{#A=OC[!],#B=[$][!]CC}", + # 0 1 2 3 4 1 5 3 4 6 7 8 + # 0 1 2 3 4 5 6 7 11 8 9 10 + # note that the unconsumed $ triggers rebuild of a hydrogen + # which however is appended to the end of the molecule so + # making it 11 + [('A', 'O C H H H'), ('B', 'C C H H H H H'),], + [(0, 1), (0, 2), (1, 3), (1, 4), (1, 6), (6, 8), (6, 9), + (6, 10),]), )) def test_resolve_molecule(smile, ref_nodes, ref_edges): meta_mol, molecule = MoleculeResolver(smile).resolve() From c53a451c7a3e30e4d9860adf609aef29079c2b4f Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 11:35:11 +0200 Subject: [PATCH 04/19] implement new sorting function; attatch graph_frag as node attribute in annotate fragments --- cgsmiles/graph_utils.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index 88cd126..3687f05 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -37,7 +37,7 @@ def merge_graphs(source_graph, target_graph, max_node=None): # We assume that the last id is always the largest. last_node_idx = max_node offset = last_node_idx - fragment_offset = max(source_graph.nodes[last_node_idx].get('fragid', [0])) + fragment_offset = max(source_graph.nodes[last_node_idx].get('fragid', [0])) + 1 correspondence = {} for idx, node in enumerate(target_graph.nodes(), start=offset + 1): @@ -53,31 +53,28 @@ def merge_graphs(source_graph, target_graph, max_node=None): return correspondence -def sort_nodes(graph, sortby_attrs=("fragid", "atomid"), target_attr=None): +def sort_nodes_by_attr(graph, sort_attr="fragid"): """ - Sort nodes in graph by multiple attributes. + Sort nodes in graph by attribute and relable the graph in place. Parameters ---------- graph: :class:`nx.Graph` the graph to sort nodes of - sortby_attrs: tuple(str) - the attributes and in which order to sort - target_attr: `abc.hashable` - if not None indices are assigned to this - attribute starting at 1 + sort_attr: `abc.hashable` + the attribute to use for sorting Returns ------- nx.Graph graph with nodes sorted in correct order """ - node_order = sorted(graph, key=partial(_keyfunc, graph, attrs=sortby_attrs)) - for new_idx, node_key in enumerate(node_order, 1): -# graph._node.move_to_end(node_key) - if target_attr is not None: - graph.nodes[node_key][target_attr] = new_idx - return graph + fragids = nx.get_node_attributes(graph, "fragid") + sorted_ids = sorted(fragids.items(), key=lambda item: (item[1], item[0])) + print(sorted_ids) + mapping = {old[0]: new for new, old in enumerate(sorted_ids)} + new_graph = nx.relabel_nodes(graph, mapping, copy=True) + return new_graph def _keyfunc(graph, node_idx, attrs): """ @@ -115,4 +112,6 @@ def annotate_fragments(meta_graph, molecule): if molecule.has_edge(a, b): graph_frag.add_edge(a, b) + meta_graph.nodes[meta_node]['graph'] = graph_frag + return meta_graph From cdbc171e7138c8252d87da5471712d0d011df9b8 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 11:36:35 +0200 Subject: [PATCH 05/19] update tests with new ordering; assert equivl. based on graph isomorphism --- cgsmiles/tests/test_molecule_resolve.py | 67 +++++++++++++++++-------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index e8054d7..fbf5e5a 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -42,13 +42,14 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): assert new_btypes == btypes -@pytest.mark.parametrize('smile, ref_nodes, ref_edges',( +@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges',( # smiple linear seqeunce ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}", # 0 1 2 3 4 5 6 7 8 [('OHter', 'O H'), ('PEO', 'C O C H H H H'), # 9 10 11 12 13 14 15 16 17 ('PEO', 'C O C H H H H'), ('OHter', 'O H')], + 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)]), @@ -58,6 +59,7 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): [('OHter', 'O H'), ('PEO', 'C O C H H H H'), # 9 10 11 12 13 14 15 16 17 ('PEO', 'C O C H H H H'), ('OHter', 'O H')], + 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)]), @@ -67,23 +69,27 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): [('OHter', 'O Na'), ('PEO', 'C O C H H H H'), # 9 10 11 12 13 14 15 16 17 ('PEO', 'C O C H H H H'), ('OHter', 'O Na')], + 'O Na C O C H H H H C O C H H H H O Na', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)]), - # uncomsumed bonding IDs; note that this is not the same # molecule as previous test case. Here one of the OH branches # and replaces an CH2 group with CH-OH - ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[>][$1]COC[<],#OHter=[$1][O]}", + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[>][$1A]COC[<],#OHter=[$1A][O]}", + # 0 1 2 3 4 5 6 7 8 [('OHter', 'O H'), ('PEO', 'C O C H H H H'), + # 9 10 11 12 13 14 15 16 17 ('PEO', 'C O C H H H H'), ('OHter', 'O H')], - [(0, 1), (0, 2), (2, 3), (2, 5), (2, 10), (3, 4), - (4, 6), (4, 7), (4, 16), (8, 9), (8, 11), (8, 14), - (8, 17), (9, 10), (10, 12), (10, 13), (14, 15)]), + 'O H C O C H H H H C O C H H H H O H', + [(0, 1), (0, 2), (2, 3), (2, 5), (2, 9), (3, 4), + (4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13), + (10, 11), (11, 15), (11, 14), (11, 16), (16, 17)]), # simple branched sequence ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), ('Hter', 'H')], + 'H C C H H H C O C H H H H H C C H H H C O C H H H H H H', [(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8), (8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27), (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)]), @@ -93,6 +99,7 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$]H}", [('Hter', 'H'), ('PS', 'C C C C C C C C H H H H H H H H'), ('PS', 'C C C C C C C C H H H H H H H H'), ('Hter', 'H')], + 'H C C C C C C C C H H H H H H H H C C C C C C C C H H H H H H H H H', [(0, 1), (1, 2), (1, 9), (1, 10), (2, 3), (2, 11), (2, 17), (3, 4), (3, 8), (4, 5), (4, 12), (5, 6), (5, 13), (6, 7), (6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25), @@ -111,10 +118,16 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): # smiple squash operator; no unconsumed operators ("{[#A][#B]}.{#A=OC[!],#B=[!]CC}", # 0 1 2 3 4 1 5 3 4 6 7 8 - # 0 1 2 3 4 5 6 7 8 9 10 11 - [('A', 'O C H H H'), ('B', 'C C H H H H H'),], - [(0, 1), (0, 2), (1, 3), (1, 4), (1, 6), (6, 9), (6, 10), - (6, 11),]), + # note that the order here is O H then C H H + # because the C H H is shared between fragments + # so that sorting by fragment id yields this order + # the same goes for the second fragment but here + # the CH2 goes in the front because it also belongs + # to the fragment with lower fragid + [('A', 'O H C H H'), ('B', 'C H H C H H H'),], + 'O H C H H C H H H', + [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), + (5, 6), (5, 7), (5, 8)]), # smiple squash operator; unconsumed operators ("{[#A][#B]}.{#A=OC[!],#B=[$][!]CC}", # 0 1 2 3 4 1 5 3 4 6 7 8 @@ -122,19 +135,31 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): # note that the unconsumed $ triggers rebuild of a hydrogen # which however is appended to the end of the molecule so # making it 11 - [('A', 'O C H H H'), ('B', 'C C H H H H H'),], - [(0, 1), (0, 2), (1, 3), (1, 4), (1, 6), (6, 8), (6, 9), - (6, 10),]), + [('A', 'O H C H H'), ('B', 'C H H C H H H'),], + 'O H C H H C H H H', + [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), + (5, 6), (5, 7), (5, 8)]), )) -def test_resolve_molecule(smile, ref_nodes, ref_edges): +def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges): meta_mol, molecule = MoleculeResolver(smile).resolve() -# nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) -# plt.show() - for node, ref in zip(meta_mol.nodes, ref_nodes): + + # loop and compare fragments first + for node, ref in zip(meta_mol.nodes, ref_frags): assert meta_mol.nodes[node]['fragname'] == ref[0] block_graph = meta_mol.nodes[node]['graph'] - elements = nx.get_node_attributes(block_graph, 'element') #.values()) - sorted_elements = [elements[key] for key in sorted(elements)] + target_elements = nx.get_node_attributes(block_graph, 'element') + sorted_elements = [target_elements[key] for key in sorted(target_elements)] assert sorted_elements == ref[1].split() - print(molecule.edges) - assert sorted(molecule.edges) == sorted(ref_edges) + + # make the full scale reference graph + ref_graph = nx.Graph() + ref_graph.add_edges_from(ref_edges) + nx.set_node_attributes(ref_graph, + dict(zip(sorted(ref_graph.nodes), elements.split())), + 'element') + + def _ele_match(n1, n2): + return n1["element"] == n2["element"] + + # check that reference graph and molecule are isomorphic + assert nx.is_isomorphic(ref_graph, molecule, node_match=_ele_match) From fe056ac4ab2f16662c6a1f1e3ae61bea83114177 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 11:38:38 +0200 Subject: [PATCH 06/19] remove rebuilding of hydrogen atoms from fragment reader --- cgsmiles/read_fragments.py | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index a8c8957..e191a85 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -67,39 +67,6 @@ def strip_bonding_descriptors(fragment_string): smile += token return smile, bonding_descrpt -def _rebuild_h_atoms(mol_graph): - # special hack around to fix - # pysmiles bug for a single - # atom molecule; we assume that the - # hcount is just wrong and set it to - # the valance number minus bonds minus - # bonding connectors - if len(mol_graph.nodes) == 1: - ele = mol_graph.nodes[0]['element'] - # for N and P we assume the regular valency - hcount = pysmiles.smiles_helper.VALENCES[ele][0] - if mol_graph.nodes[0].get('bonding', False): - hcount -= 1 - mol_graph.nodes[0]['hcount'] = hcount - else: - for node in mol_graph.nodes: - if mol_graph.nodes[node].get('bonding', False): - # get the degree - ele = mol_graph.nodes[node]['element'] - # hcount is the valance minus the degree minus - # the number of bonding descriptors - bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\ - mol_graph.neighbors(node)])) - charge = mol_graph.nodes[node].get('charge', 0) - hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\ - bonds -\ - len(mol_graph.nodes[node]['bonding']) +\ - charge - mol_graph.nodes[node]['hcount'] = hcount - - pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) - return mol_graph - def fragment_iter(fragment_str, all_atom=True): """ Iterates over fragments defined in a CGBigSmile string. @@ -137,8 +104,6 @@ def fragment_iter(fragment_str, all_atom=True): elif all_atom: mol_graph = pysmiles.read_smiles(smile) nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') - # we need to rebuild hydrogen atoms now - _rebuild_h_atoms(mol_graph) # we deal with a CG resolution graph else: mol_graph = read_cgsmiles(smile) From 1e3973ded3b8e5e30511d5803c843047ff99e31d Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 11:53:16 +0200 Subject: [PATCH 07/19] move rebuilding of hydrogen atoms to pysmiles utils and do after connecting molecule --- cgsmiles/pysmiles_utils.py | 58 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 cgsmiles/pysmiles_utils.py diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py new file mode 100644 index 0000000..6ca32e3 --- /dev/null +++ b/cgsmiles/pysmiles_utils.py @@ -0,0 +1,58 @@ +import pysmiles + +VALENCES = pysmiles.smiles_helper.VALENCES +VALENCES.update({"H": (1,)}) + +def rebuild_h_atoms(mol_graph, keep_bonding=False): + """ + Helper function which add hydrogen atoms to the molecule graph. + + First the hcount attribute produced by pysmiles us updated, because + fragments have no bonds at time of reading so pysmiles does not + know the connectivity. Hence the hcount is redone based on the + actual connectivity of the final molecule. + + This function also makes sure to tag single hydrogen fragments, + in order to not merge them with adjecent fragments. Otherwise, + explicit single hydrogen residues would not be possible. + + The molecule graph is updated in place with the hydrogen atoms + that are missing. + + Using the keep_bonding argument the hydrogen count is reduced + by the number of bonding descriptors. In this way hydrogen + atoms can also be added to fragments only. + + Parameters + ---------- + mol_graph: :class:`nx.Graph` + graph describing the full molecule without hydrogen atoms + """ + for node in mol_graph.nodes: + if mol_graph.nodes[node].get('bonding', False): + # get the degree + ele = mol_graph.nodes[node]['element'] + # hcount is the valance minus the degree minus + # the number of bonding descriptors + bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\ + mol_graph.neighbors(node)])) + charge = mol_graph.nodes[node].get('charge', 0) + hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\ + bonds +\ + charge + # in this case we only rebuild hydrogen atoms that are not + # replaced by bonding operators. + if keep_bonding: + hcount -= len(mol_graph.nodes[node]['bonding']) + + mol_graph.nodes[node]['hcount'] = hcount + if ele == "H": + mol_graph.nodes[node]['single_h_frag'] = True + + pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) + for node in mol_graph.nodes: + if mol_graph.nodes[node].get("element", "*") == "H" and\ + not mol_graph.nodes[node].get("single_h_frag", False): + ref_node = next(mol_graph.neighbors(node)) + mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"] + mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"] From 183baaa7384df90ae90ac094195aa33cef26e3b9 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 11:53:52 +0200 Subject: [PATCH 08/19] clean up resolve part I --- cgsmiles/resolve.py | 64 +++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 43 deletions(-) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index 27a8bbe..ed96222 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -3,10 +3,8 @@ import pysmiles from .read_cgsmiles import read_cgsmiles from .read_fragments import read_fragments -from .graph_utils import merge_graphs, sort_nodes, annotate_fragments - -VALENCES = pysmiles.smiles_helper.VALENCES -VALENCES.update({"H": (1,)}) +from .graph_utils import merge_graphs, sort_nodes_by_attr, annotate_fragments +from .pysmiles_utils import rebuild_h_atoms def compatible(left, right): """ @@ -146,8 +144,10 @@ def resolve_disconnected_molecule(self): for a, b in fragment.edges: new_a = correspondence[a] new_b = correspondence[b] + attrs = fragment.edges[(a, b)] graph_frag.add_edge(new_a, - new_b) + new_b, + **attrs) self.meta_graph.nodes[meta_node]['graph'] = graph_frag @@ -210,7 +210,10 @@ def squash_atoms(self): # add edges for node in new_edge_nodes: - self.molecule.add_edge(edge[0], node) + order = re.findall("\d+\.\d+", bonding[0]) + if not order: + order = 1 + self.molecule.add_edge(edge[0], node, order=order) # find the reference hydrogen atoms nodes_to_keep = [edge[0]] @@ -223,43 +226,9 @@ def squash_atoms(self): other_fragid = self.molecule.nodes[node]['fragid'] self.molecule.remove_node(node) self.molecule.nodes[ref_node]['fragid'] += other_fragid - squashed = True - - if squashed: - self.meta_graph = annotate_fragments(self.meta_graph, - self.molecule) - - def replace_unconsumed_bonding_descrpt(self): - """ - We allow multiple bonding descriptors per atom, which - however, are not always consumed. In this case the left - over bonding descriptors are replaced by hydrogen atoms. - """ - for meta_node in self.meta_graph.nodes: - graph = self.meta_graph.nodes[meta_node]['graph'] - bonding = nx.get_node_attributes(graph, "bonding") - for node, bondings in bonding.items(): - if bondings: - element = graph.nodes[node]['element'] - bonds = round(sum([self.molecule.edges[(node, neigh)]['order'] for neigh in\ - self.molecule.neighbors(node)])) - hcount = VALENCES[element][0] - bonds - attrs = {attr: graph.nodes[node][attr] for attr in ['fragname', 'fragid']} - attrs['element'] = 'H' - for _ in range(0, hcount): - new_node = len(self.molecule.nodes) #+ 1 - graph.add_edge(node, new_node) - attrs['atomname'] = "H" + str(len(graph.nodes)-1) - graph.nodes[new_node].update(attrs) - self.molecule.add_edge(node, new_node, order=1) - self.molecule.nodes[new_node].update(attrs) - - # now we want to sort the atoms - sort_nodes(self.molecule) - # and redo the meta molecule - self.meta_graph = annotate_fragments(self.meta_graph, self.molecule) def resolve(self): + if self.cgsmiles_string is not None: self.meta_graph = read_cgsmiles(self.cgsmiles_string) @@ -268,9 +237,18 @@ def resolve(self): all_atom=self.all_atom)) self.resolve_disconnected_molecule() + self.edges_from_bonding_descrpt() - if self.all_atom: - self.replace_unconsumed_bonding_descrpt() self.squash_atoms() + + if self.all_atom: + rebuild_h_atoms(self.molecule) + + # now we want to sort the atoms + self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid")) + # and redo the meta molecule + self.meta_graph = annotate_fragments(self.meta_graph, + self.molecule) + return self.meta_graph, self.molecule From 4a7c7d70cc5b7458f12bcda32b45b82f7e977393 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 12:12:36 +0200 Subject: [PATCH 09/19] clean up squash function --- cgsmiles/resolve.py | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index ed96222..61f2fb4 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -193,39 +193,32 @@ def squash_atoms(self): for edge, bonding in bondings.items(): # we have a squash operator if bonding[0].startswith('!'): - # find all hydrogens to remove - # and which atoms to connect - nodes_to_remove = [edge[1]] + node_to_keep, node_to_remove = edge + # find all connections of discarded node + # so we can add them to the remaining node new_edge_nodes = [] - for hnode in self.molecule.neighbors(edge[1]): - if self.molecule.nodes[hnode].get('element', 'Nan') == 'H': - nodes_to_remove.append(hnode) - elif hnode != edge[0]: - new_edge_nodes.append(hnode) + for neigh_node in self.molecule.neighbors(edge[1]): + if neigh_node != edge[0]: + new_edge_nodes.append(neigh_node) - # remove edges - self.molecule.remove_edge(*edge) - for node in nodes_to_remove[1:]: - self.molecule.remove_edge(edge[1], node) - - # add edges + # add new edges between the old and new node for node in new_edge_nodes: order = re.findall("\d+\.\d+", bonding[0]) + if not order: order = 1 - self.molecule.add_edge(edge[0], node, order=order) - # find the reference hydrogen atoms - nodes_to_keep = [edge[0]] - for hnode in self.molecule.neighbors(edge[0]): - if self.molecule.nodes[hnode].get('element', 'Nan') == 'H': - nodes_to_keep.append(hnode) + self.molecule.add_edge(node_to_keep, + node, + order=order) - # remove squashed node and hydrogen atoms - for ref_node, node in zip(nodes_to_keep, nodes_to_remove): - other_fragid = self.molecule.nodes[node]['fragid'] - self.molecule.remove_node(node) - self.molecule.nodes[ref_node]['fragid'] += other_fragid + # add the fragment id of the sequashed node + self.molecule.nodes[node_to_keep]['fragid'] +=\ + self.molecule.nodes[node_to_remove]['fragid'] + + # remove the edge and node + self.molecule.remove_edge(*edge) + self.molecule.remove_node(node_to_remove) def resolve(self): @@ -247,6 +240,7 @@ def resolve(self): # now we want to sort the atoms self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid")) + # and redo the meta molecule self.meta_graph = annotate_fragments(self.meta_graph, self.molecule) From dff4d4ac3aabcd7ecfb0728798bb67e6091b5c0f Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 12:17:57 +0200 Subject: [PATCH 10/19] clean up squash function --- cgsmiles/resolve.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index 61f2fb4..0b199bb 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -186,7 +186,10 @@ def edges_from_bonding_descrpt(self): def squash_atoms(self): """ - Applies the squash operator. + Applies the squash operator by removing the duplicate node + adding, all edges from that node to the remaining one, and + annotate the other node with the fragid of the removed + node. """ bondings = nx.get_edge_attributes(self.molecule, 'bonding') squashed = False From 7296746fac889f5b80464a78aea15d2178a88f26 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 13:24:12 +0200 Subject: [PATCH 11/19] adjust test --- cgsmiles/tests/test_cgsmile_parsing.py | 46 +++++++------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index a06dd6b..b77b287 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -151,72 +151,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding): {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (3, {"atomname": "H3", "fragname": "PEO", "element": "H"}), - (4, {"atomname": "H4", "fragname": "PEO", "element": "H"}), - (5, {"atomname": "H5", "fragname": "PEO", "element": "H"}), - (6, {"atomname": "H6", "fragname": "PEO", "element": "H"}), )}, - {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), + {"PEO": [(0, 1), (1, 2)]}), # single fragment but with explicit hydrogen in smiles ("{#PEO=[$][CH2]O[CH2][$]}", {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (3, {"atomname": "H3", "fragname": "PEO", "element": "H"}), - (4, {"atomname": "H4", "fragname": "PEO", "element": "H"}), - (5, {"atomname": "H5", "fragname": "PEO", "element": "H"}), - (6, {"atomname": "H6", "fragname": "PEO", "element": "H"}), )}, - {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), + {"PEO": [(0, 1), (1, 2)]}), # test NH3 terminal ("{#AMM=N[$]}", {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$"], "element": "N"}), - (1, {"atomname": "H1", "fragname": "AMM", "element": "H"}), - (2, {"atomname": "H2", "fragname": "AMM", "element": "H"}), )}, - {"AMM": [(0, 1), (0, 2)]}), + {"AMM": []}), # single fragment + 1 terminal (i.e. only 1 bonding descrpt ("{#PEO=[$]COC[$],#OHter=[$][OH]}", {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}), - (3, {"atomname": "H3", "fragname": "PEO", "element": "H"}), - (4, {"atomname": "H4", "fragname": "PEO", "element": "H"}), - (5, {"atomname": "H5", "fragname": "PEO", "element": "H"}), - (6, {"atomname": "H6", "fragname": "PEO", "element": "H"}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}), - (1, {"atomname": "H1", "fragname": "OHter", "element": "H"}))}, - {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)], - "OHter": [(0, 1)]}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),)}, + {"PEO": [(0, 1), (1, 2)], + "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # this adjust the hydrogen count ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}", {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}), - (3, {"atomname": "H3", "fragname": "PEO", "element": "H"}), - (4, {"atomname": "H4", "fragname": "PEO", "element": "H"}), - (5, {"atomname": "H5", "fragname": "PEO", "element": "H"}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}), - (1, {"atomname": "H1", "fragname": "OHter", "element": "H"}))}, - {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)], - "OHter": [(0, 1)]}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),)}, + {"PEO": [(0, 1), (1, 2)], + "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # but explicit hydrogen in the smiles string ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}", {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}), - (3, {"atomname": "H3", "fragname": "PEO", "element": "H"}), - (4, {"atomname": "H4", "fragname": "PEO", "element": "H"}), - (5, {"atomname": "H5", "fragname": "PEO", "element": "H"}), ), "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}), - (1, {"atomname": "H1", "fragname": "OHter", "element": "H"}))}, - {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)], - "OHter": [(0, 1)]}), + )}, + {"PEO": [(0, 1), (1, 2),], + "OHter": []}), )) def test_fragment_iter(fragment_str, nodes, edges): From 555841553eff188ad356ec4f4e4cabb029d9c12f Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 15:55:30 +0200 Subject: [PATCH 12/19] add test where squash and conect are used on same atom --- cgsmiles/tests/test_molecule_resolve.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index fbf5e5a..43b5b8b 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -139,6 +139,18 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): 'O H C H H C H H H', [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), (5, 6), (5, 7), (5, 8)]), + # smiple squash operator; plus connect operator + ("{[#A][#B][#C]}.{#A=OC[!],#B=[$][!]CC,#C=[$]O}", + # 0 1 2 3 4 1 5 3 4 6 7 8 + # 0 1 2 3 4 5 6 7 11 8 9 10 + # note that the unconsumed $ triggers rebuild of a hydrogen + # which however is appended to the end of the molecule so + # making it 11 + [('A', 'O H C H'), ('B', 'C H C H H H'), ('C', 'O H')], + 'O H C H C H H H O H', + [(0, 1), (0, 2), (2, 3), (2, 4), + (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)]), + )) def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges): meta_mol, molecule = MoleculeResolver(smile).resolve() From 199d3e4bbc1c2d1e9d59db1972a59a0561781564 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 16:04:21 +0200 Subject: [PATCH 13/19] make squash function even easier --- cgsmiles/resolve.py | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index 0b199bb..48e9078 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -194,34 +194,18 @@ def squash_atoms(self): bondings = nx.get_edge_attributes(self.molecule, 'bonding') squashed = False for edge, bonding in bondings.items(): - # we have a squash operator - if bonding[0].startswith('!'): - node_to_keep, node_to_remove = edge - # find all connections of discarded node - # so we can add them to the remaining node - new_edge_nodes = [] - for neigh_node in self.molecule.neighbors(edge[1]): - if neigh_node != edge[0]: - new_edge_nodes.append(neigh_node) - - # add new edges between the old and new node - for node in new_edge_nodes: - order = re.findall("\d+\.\d+", bonding[0]) - - if not order: - order = 1 - - self.molecule.add_edge(node_to_keep, - node, - order=order) - - # add the fragment id of the sequashed node - self.molecule.nodes[node_to_keep]['fragid'] +=\ - self.molecule.nodes[node_to_remove]['fragid'] - - # remove the edge and node - self.molecule.remove_edge(*edge) - self.molecule.remove_node(node_to_remove) + if not bonding[0].startswith('!'): + continue + # let's squash two nodes + node_to_keep, node_to_remove = edge + self.molecule = nx.contracted_nodes(self.molecule, + node_to_keep, + node_to_remove, + self_loops=False) + + # add the fragment id of the sequashed node + self.molecule.nodes[node_to_keep]['fragid'] +=\ + self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid'] def resolve(self): From cdf3d0055fc99045b003c5fa3aed0030655d9a1d Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 16 Apr 2024 16:11:23 +0200 Subject: [PATCH 14/19] improve bonding function by making deepcopy of bonding list in first place --- cgsmiles/resolve.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index 48e9078..7c6aec5 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -1,4 +1,5 @@ import re +import copy import networkx as nx import pysmiles from .read_cgsmiles import read_cgsmiles @@ -127,6 +128,10 @@ def __init__(self, def resolve_disconnected_molecule(self): """ + Given a connected graph of nodes with associated fragment graphs + generate a disconnected graph of the fragments and annotate + each fragment graph to the node in the higher resolution + graph. """ for meta_node in self.meta_graph.nodes: fragname = self.meta_graph.nodes[meta_node]['fragname'] @@ -137,14 +142,14 @@ def resolve_disconnected_molecule(self): for node in fragment.nodes: new_node = correspondence[node] - attrs = self.molecule.nodes[new_node] + attrs = copy.deepcopy(self.molecule.nodes[new_node]) graph_frag.add_node(correspondence[node], **attrs) nx.set_node_attributes(graph_frag, [meta_node], 'fragid') for a, b in fragment.edges: new_a = correspondence[a] new_b = correspondence[b] - attrs = fragment.edges[(a, b)] + attrs = copy.deepcopy(fragment.edges[(a, b)]) graph_frag.add_edge(new_a, new_b, **attrs) @@ -166,20 +171,13 @@ def edges_from_bonding_descrpt(self): edge, bonding = generate_edge(prev_graph, node_graph) - #To DO - clean copying of bond-list attribute - # this is a bit of a workaround because at this stage the - # bonding list is actually shared between all residues of - # of the same type; so we first make a copy then we replace - # the list sans used bonding descriptor - prev_bond_list = prev_graph.nodes[edge[0]]['bonding'].copy() - prev_bond_list.remove(bonding[0]) - prev_graph.nodes[edge[0]]['bonding'] = prev_bond_list - node_bond_list = node_graph.nodes[edge[1]]['bonding'].copy() - node_bond_list.remove(bonding[1]) - node_graph.nodes[edge[1]]['bonding'] = node_bond_list - order = re.findall("\d+\.\d+", bonding[0]) + # remove used bonding descriptors + prev_graph.nodes[edge[0]]['bonding'].remove(bonding[0]) + node_graph.nodes[edge[1]]['bonding'].remove(bonding[1]) + # bonding descriptors are assumed to have bonding order 1 # unless they are specifically annotated + order = re.findall("\d+\.\d+", bonding[0]) if not order: order = 1 self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order) @@ -216,16 +214,20 @@ def resolve(self): self.fragment_dict.update(read_fragments(self.fragment_string, all_atom=self.all_atom)) + # add disconnected fragments to graph self.resolve_disconnected_molecule() + # connect valid bonding descriptors self.edges_from_bonding_descrpt() + # contract atoms with squash descriptors self.squash_atoms() + # rebuild hydrogen in all-atom case if self.all_atom: rebuild_h_atoms(self.molecule) - # now we want to sort the atoms + # sort the atoms self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid")) # and redo the meta molecule From 297130d5467a5e43ea635897c2323e82e757d9bd Mon Sep 17 00:00:00 2001 From: "Dr. Fabian Grunewald" <32294573+fgrunewald@users.noreply.github.com> Date: Thu, 2 May 2024 10:38:17 +0200 Subject: [PATCH 15/19] Apply suggestions from code review add peters suggestions Co-authored-by: Peter C Kroon --- cgsmiles/graph_utils.py | 7 +++---- cgsmiles/pysmiles_utils.py | 2 +- cgsmiles/resolve.py | 3 +-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index 3687f05..1bbd88c 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -69,10 +69,9 @@ def sort_nodes_by_attr(graph, sort_attr="fragid"): nx.Graph graph with nodes sorted in correct order """ - fragids = nx.get_node_attributes(graph, "fragid") - sorted_ids = sorted(fragids.items(), key=lambda item: (item[1], item[0])) - print(sorted_ids) - mapping = {old[0]: new for new, old in enumerate(sorted_ids)} + attr_values = nx.get_node_attributes(graph, sort_attr) + sorted_ids = sorted(attr_values, key=lambda item: (attr_values[item], item)) + mapping = {old: new for new, old in enumerate(sorted_ids)} new_graph = nx.relabel_nodes(graph, mapping, copy=True) return new_graph diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 6ca32e3..677a7bb 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -7,7 +7,7 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): """ Helper function which add hydrogen atoms to the molecule graph. - First the hcount attribute produced by pysmiles us updated, because + First the hcount attribute produced by pysmiles is updated, because fragments have no bonds at time of reading so pysmiles does not know the connectivity. Hence the hcount is redone based on the actual connectivity of the final molecule. diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index 7c6aec5..4498bbe 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -202,8 +202,7 @@ def squash_atoms(self): self_loops=False) # add the fragment id of the sequashed node - self.molecule.nodes[node_to_keep]['fragid'] +=\ - self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid'] + self.molecule.nodes[node_to_keep]['fragid'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid'] def resolve(self): From d6046f4c264f554f990078936213f34a3fbe970a Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 5 May 2024 14:28:10 +0200 Subject: [PATCH 16/19] allow bond orders based on SMILES syntax for bonding descrip --- cgsmiles/read_fragments.py | 46 +++++++++++++++++++++++-- cgsmiles/resolve.py | 6 ++-- cgsmiles/tests/test_cgsmile_parsing.py | 46 ++++++++++++++----------- cgsmiles/tests/test_molecule_resolve.py | 33 ++++++++++++++++-- 4 files changed, 101 insertions(+), 30 deletions(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index e191a85..ff1417a 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -6,6 +6,35 @@ import pysmiles from .read_cgsmiles import read_cgsmiles +class PeekIter(object): + """ + Custom iter that allows looking ahead, without + advancing the actual iter. + """ + def __init__(self, collection): + self.collection = collection + self.index = 0 + + def __next__(self): + try: + result = self.collection[self.index] + self.index += 1 + except IndexError: + raise StopIteration + return result + + def peek(self): + try: + result = self.collection[self.index] + except IndexError: + return "" + return result + + def __iter__(self): + self.index = 0 + return self + + def strip_bonding_descriptors(fragment_string): """ Processes a CGBigSmile fragment string by @@ -27,11 +56,13 @@ def strip_bonding_descriptors(fragment_string): a dict mapping bonding descriptors to the nodes within the string """ - smile_iter = iter(fragment_string) + bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0} + smile_iter = PeekIter(fragment_string) bonding_descrpt = defaultdict(list) smile = "" node_count = 0 prev_node = 0 + current_order = None for token in smile_iter: if token == '[': peek = next(smile_iter) @@ -41,7 +72,14 @@ def strip_bonding_descriptors(fragment_string): while peek != ']': bond_descrp += peek peek = next(smile_iter) - bonding_descrpt[prev_node].append(bond_descrp) + if smile_iter.peek() in bond_to_order: + order = bond_to_order[next(smile_iter)] + elif current_order: + order = current_order + current_order = None + else: + order = 1 + bonding_descrpt[prev_node].append(bond_descrp + str(order)) else: atom = token while peek != ']': @@ -59,8 +97,10 @@ def strip_bonding_descriptors(fragment_string): elif token == ')': prev_node = anchor smile += token + elif token in bond_to_order: + current_order = bond_to_order[token] else: - if token not in '] H @ . - = # $ : / \\ + - %'\ + if token not in '] H @ $ / \\ + - %'\ and not token.isdigit(): prev_node = node_count node_count += 1 diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index 4498bbe..a0a373d 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -21,7 +21,7 @@ def compatible(left, right): ------- bool """ - if left == right and left not in '> <': + if left == right and left[0] not in '> <': return True l, r = left[0], right[0] if (l, r) == ('<', '>') or (l, r) == ('>', '<'): @@ -177,9 +177,7 @@ def edges_from_bonding_descrpt(self): # bonding descriptors are assumed to have bonding order 1 # unless they are specifically annotated - order = re.findall("\d+\.\d+", bonding[0]) - if not order: - order = 1 + order = int(bonding[0][-1]) self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order) def squash_atoms(self): diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index f7324c5..161319a 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -127,31 +127,35 @@ def test_read_cgsmiles(smile, nodes, edges): # smiple symmetric bonding ("[$]COC[$]", "COC", - {0: ["$"], 2: ["$"]}), + {0: ["$1"], 2: ["$1"]}), + # smiple symmetric bonding with more than one name + ("[$1A]COC[$1A]", + "COC", + {0: ["$1A1"], 2: ["$1A1"]}), # simple symmetric but with explicit hydrogen ("[$][CH2]O[CH2][$]", "[CH2]O[CH2]", - {0: ["$"], 2: ["$"]}), + {0: ["$1"], 2: ["$1"]}), # smiple symmetric bonding; multiple descript ("[$]COC[$][$1]", "COC", - {0: ["$"], 2: ["$", "$1"]}), + {0: ["$1"], 2: ["$1", "$11"]}), # named different bonding descriptors ("[$1]CCCC[$2]", "CCCC", - {0: ["$1"], 3: ["$2"]}), + {0: ["$11"], 3: ["$21"]}), # ring and bonding descriptors ("[$1]CC[$2]C1CCCCC1", "CCC1CCCCC1", - {0: ["$1"], 1: ["$2"]}), + {0: ["$11"], 1: ["$21"]}), # bonding descript. after branch ("C(COC[$1])[$2]CCC[$3]", "C(COC)CCC", - {0: ["$2"], 3: ["$1"], 6: ["$3"]}), + {0: ["$21"], 3: ["$11"], 6: ["$31"]}), # left rigth bonding desciptors ("[>]COC[<]", "COC", - {0: [">"], 2: ["<"]}) + {0: [">1"], 2: ["<1"]}) )) def test_strip_bonding_descriptors(big_smile, smile, bonding): new_smile, new_bonding = strip_bonding_descriptors(big_smile) @@ -161,50 +165,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding): @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment ("{#PEO=[$]COC[$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), )}, {"PEO": [(0, 1), (1, 2)]}), # single fragment but with explicit hydrogen in smiles ("{#PEO=[$][CH2]O[CH2][$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), )}, {"PEO": [(0, 1), (1, 2)]}), # test NH3 terminal ("{#AMM=N[$]}", - {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$"], "element": "N"}), + {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N"}), )}, {"AMM": []}), # single fragment + 1 terminal (i.e. only 1 bonding descrpt ("{#PEO=[$]COC[$],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # this adjust the hydrogen count ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C"}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # but explicit hydrogen in the smiles string ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C"}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}), )}, {"PEO": [(0, 1), (1, 2),], "OHter": []}), diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index 43b5b8b..2c481c8 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -24,6 +24,11 @@ {0: ['>'], 1: ['$5']}, (3, 0), ('<', '>')), + # left right with annotated > < + ({0: ['$'], 1: ['>1'], 3: ['<1']}, + {0: ['>1'], 1: ['$5']}, + (3, 0), + ('<1', '>1')), # left right selective bonding # with identifier ({0: ['$'], 1: ['>'], 3: ['<1']}, @@ -53,6 +58,16 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)]), + # smiple linear seqeunce with bond-order in link + ("{[#TC1][#TC4][#TC1]}.{#TC1=[$1]=CC=[$2],#TC4=[$1]=CC=[$2]}", + # 0 1 2 3 4 5 6 7 8 9 + [('TC1', 'C C H H H H'), ('TC4', 'C C H H'), + # 10 11 12 13 14 15 + ('TC1', 'C C H H H H')], + 'C C H H H H C C H H C C H H H H', + [(0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (0, 6), (6, 7), + (6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13), + (10, 14), (11, 15)]), # smiple linear seqeunce unconsumed bonding descrpt ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}", # 0 1 2 3 4 5 6 7 8 @@ -73,6 +88,16 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)]), + # smiple linear seqeunce with ionic ending + ("{[#OH][#PEO]|2[#ON]}.{#PEO=[$]COC[$],#OH=[$]O,#ON=[$][O-]}", + # 0 1 2 3 4 5 6 7 8 + [('OH', 'O H'), ('PEO', 'C O C H H H H'), + # 9 10 11 12 13 14 15 16 17 + ('PEO', 'C O C H H H H'), ('ON', 'O')], + 'O H C O C H H H H C O C H H H H O', + [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), + (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), + (11, 14), (11, 15), (11, 16)]), # uncomsumed bonding IDs; note that this is not the same # molecule as previous test case. Here one of the OH branches # and replaces an CH2 group with CH-OH @@ -82,9 +107,9 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): # 9 10 11 12 13 14 15 16 17 ('PEO', 'C O C H H H H'), ('OHter', 'O H')], 'O H C O C H H H H C O C H H H H O H', - [(0, 1), (0, 2), (2, 3), (2, 5), (2, 9), (3, 4), + [(0, 1), (0, 2), (2, 3), (2, 5), (2, 11), (3, 4), (4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13), - (10, 11), (11, 15), (11, 14), (11, 16), (16, 17)]), + (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)]), # simple branched sequence ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), @@ -173,5 +198,9 @@ def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges): def _ele_match(n1, n2): return n1["element"] == n2["element"] + print(smile) + print(ref_graph.edges) + print(molecule.edges) + assert ref_graph.edges == molecule.edges # check that reference graph and molecule are isomorphic assert nx.is_isomorphic(ref_graph, molecule, node_match=_ele_match) From 2bf7c7991b2a53df2d2dc4c5c3404e1b8b0a493f Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 5 May 2024 14:51:14 +0200 Subject: [PATCH 17/19] add hcounts --- cgsmiles/tests/test_cgsmile_parsing.py | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 161319a..8864216 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -165,28 +165,28 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding): @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment ("{#PEO=[$]COC[$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), )}, {"PEO": [(0, 1), (1, 2)]}), # single fragment but with explicit hydrogen in smiles ("{#PEO=[$][CH2]O[CH2][$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), )}, {"PEO": [(0, 1), (1, 2)]}), # test NH3 terminal ("{#AMM=N[$]}", - {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N"}), + {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3}), )}, {"AMM": []}), # single fragment + 1 terminal (i.e. only 1 bonding descrpt ("{#PEO=[$]COC[$],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), ), "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}),)}, {"PEO": [(0, 1), (1, 2)], @@ -194,21 +194,21 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding): # single fragment + 1 terminal but multiple bond descritp. # this adjust the hydrogen count ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # but explicit hydrogen in the smiles string ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C"}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}), )}, {"PEO": [(0, 1), (1, 2),], "OHter": []}), From 823fcec75bbf7d111dae37b4d798a63a28349d03 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 5 May 2024 15:53:58 +0200 Subject: [PATCH 18/19] use _valence function to get valences --- cgsmiles/pysmiles_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 677a7bb..5c9f4e5 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -37,7 +37,7 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\ mol_graph.neighbors(node)])) charge = mol_graph.nodes[node].get('charge', 0) - hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\ + hcount = pysmiles.smiles_helper._valence(mol_graph, node, minimum=0) -\ bonds +\ charge # in this case we only rebuild hydrogen atoms that are not From 9b91d31b2cb25a1b687675e938fed3eb3a30c2e5 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 6 May 2024 15:58:18 +0200 Subject: [PATCH 19/19] add new iterator --- cgsmiles/read_fragments.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index ff1417a..101ba3b 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -12,26 +12,27 @@ class PeekIter(object): advancing the actual iter. """ def __init__(self, collection): - self.collection = collection - self.index = 0 + self.collection = iter(collection) + self._peek = None def __next__(self): - try: - result = self.collection[self.index] - self.index += 1 - except IndexError: - raise StopIteration - return result + if self._peek: + item = self._peek + self._peek = None + else: + item = next(self.collection) + return item def peek(self): + if self._peek: + return self._peek try: - result = self.collection[self.index] - except IndexError: - return "" - return result + self._peek = next(self) + except StopIteration: + self._peek = None + return self._peek def __iter__(self): - self.index = 0 return self