From 299c03353d5316fee9a647068e1ac32103bd2da4 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 12 Sep 2024 15:24:38 +0200 Subject: [PATCH 1/7] implement isomerism bug fixes plus tests --- cgsmiles/graph_utils.py | 27 ++++++++- cgsmiles/pysmiles_utils.py | 27 +++++++++ cgsmiles/read_fragments.py | 30 ++++++++-- cgsmiles/resolve.py | 17 ++++-- cgsmiles/tests/test_cgsmile_parsing.py | 76 ++++++++++++++++++++---- cgsmiles/tests/test_molecule_resolve.py | 79 ++++++++++++++++++++----- 6 files changed, 217 insertions(+), 39 deletions(-) diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index d2bd0ab..2d83d81 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -44,6 +44,11 @@ def merge_graphs(source_graph, target_graph, max_node=None): correspondence[node] = idx new_atom = copy.copy(target_graph.nodes[node]) new_atom['fragid'] = [(new_atom.get('fragid', 0) + fragment_offset)] + # make sure to propagate the ez isomers + if 'ez_isomer_atoms' in new_atom: + new_atom['ez_isomer_atoms'] = (new_atom['ez_isomer_atoms'][0]+offset+1, + new_atom['ez_isomer_atoms'][1]+offset+1) + # make sure to propagate the ez atoms source_graph.add_node(idx, **new_atom) for node1, node2 in target_graph.edges: @@ -53,7 +58,9 @@ def merge_graphs(source_graph, target_graph, max_node=None): return correspondence -def sort_nodes_by_attr(graph, sort_attr="fragid"): +def sort_nodes_by_attr(graph, + sort_attr="fragid", + relative_attr=[('ez_isomer_atoms', True)]): """ Sort nodes in graph by attribute and relable the graph in place. @@ -63,6 +70,13 @@ def sort_nodes_by_attr(graph, sort_attr="fragid"): the graph to sort nodes of sort_attr: `abc.hashable` the attribute to use for sorting + relative_attr: tuple(str, bool) + a list of attributes that are sensetive + to the ordering of nodes (i.e. refer to + other nodes). The second element indicates + the depth. If False the value of attr are + node keys. If True we expect the value to + be an itertable with node keys. Returns ------- @@ -73,6 +87,17 @@ def sort_nodes_by_attr(graph, sort_attr="fragid"): sorted_ids = sorted(attr_values, key=lambda item: (attr_values[item], item)) mapping = {old: new for new, old in enumerate(sorted_ids)} new_graph = nx.relabel_nodes(graph, mapping, copy=True) + for attr, is_list in relative_attr: + attr_dict = nx.get_node_attributes(new_graph, attr) + new_dict = {} + for key, values in attr_dict.items(): + if is_list: + new_values = [mapping[value] for value in values] + else: + new_values = mapping[values] + new_dict[key] = new_values + if new_dict: + nx.set_node_attributes(new_graph, new_dict, attr) return new_graph def _keyfunc(graph, node_idx, attrs): diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 08cf4de..7f536cd 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -1,5 +1,8 @@ +import logging import networkx as nx import pysmiles +from pysmiles.smiles_helper import _annotate_ez_isomers +LOGGER = logging.getLogger(__name__) def rebuild_h_atoms(mol_graph, keep_bonding=False): """ @@ -52,3 +55,27 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): ref_node = next(mol_graph.neighbors(node)) mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"] mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"] + +def annotate_ez_isomers(molecule): + """ + Small wrapper dealing with ez_isomer annotation. + + Parameters + ---------- + molecule: nx.Graph + The molecule of intrest, which must of ez_isomer_pairs + and ez_isomer_class set as node attributes + """ + ez_isomer_atoms = nx.get_node_attributes(molecule, 'ez_isomer_atoms') + ez_isomer_class = nx.get_node_attributes(molecule, 'ez_isomer_class') + ez_isomer_atoms_list = [atoms + [_class] for atoms, _class in zip(ez_isomer_atoms.values(), ez_isomer_class.values())] + ez_isomer_pairs = list(zip(ez_isomer_atoms_list[:-1], ez_isomer_atoms_list[1:])) + if len(ez_isomer_atoms)%2 != 0: + msg = ("You have an uneven amount of atoms marked as CIS/TRANS isomers." + "We will drop the last atom from assigning the iosmers.") + LOGGER.warning(msg) + _annotate_ez_isomers(molecule, ez_isomer_pairs) + # clean up + for node in ez_isomer_atoms: + del molecule.nodes[node]['ez_isomer_atoms'] + del molecule.nodes[node]['ez_isomer_class'] diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index c07e9c4..a72d22f 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -60,6 +60,8 @@ def strip_bonding_descriptors(fragment_string): bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0} smile_iter = PeekIter(fragment_string) bonding_descrpt = defaultdict(list) + ez_isomer_atoms = {} + rs_isomers = {} smile = "" node_count = 0 prev_node = 0 @@ -84,7 +86,14 @@ def strip_bonding_descriptors(fragment_string): else: atom = token while peek != ']': - atom += peek + # deal with rs chirality + if peek == '@': + chiral_token = peek + if smile_iter.peek() == '@': + chiral_token = '@' + next(smile_iter) + rs_isomers[node_count] = (chiral_token, []) + else: + atom += peek peek = next(smile_iter) smile = smile + atom + "]" prev_node = node_count @@ -99,8 +108,15 @@ def strip_bonding_descriptors(fragment_string): elif token in bond_to_order: current_order = bond_to_order[token] smile += token - elif token in '] H @ . - = # $ : / \\ + - %' or token.isdigit(): + elif token in '] H . - = # $ : + - %' or token.isdigit(): smile += token + # deal with ez isomers + elif token in '/ \\': + # we have a branch + if node_count == 0: + ez_isomer_atoms[node_count] = (node_count, 1, token) + else: + ez_isomer_atoms[node_count] = (node_count, prev_node, token) else: if smile_iter.peek() and token + smile_iter.peek() in ['Cl', 'Br', 'Si', 'Mg', 'Na']: smile += (token + next(smile_iter)) @@ -110,7 +126,7 @@ def strip_bonding_descriptors(fragment_string): prev_node = node_count node_count += 1 - return smile, bonding_descrpt + return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms def fragment_iter(fragment_str, all_atom=True): """ @@ -140,7 +156,7 @@ def fragment_iter(fragment_str, all_atom=True): delim = fragment.find('=', 0) fragname = fragment[1:delim] big_smile = fragment[delim+1:] - smile, bonding_descrpt = strip_bonding_descriptors(big_smile) + smile, bonding_descrpt, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile) if smile == "H": mol_graph = nx.Graph() mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) @@ -148,6 +164,12 @@ def fragment_iter(fragment_str, all_atom=True): elif all_atom: mol_graph = pysmiles.read_smiles(smile, reinterpret_aromatic=False, strict=False) nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer') + # we need to split countable node keys and the associated value + ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()} + ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()} + nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms') + nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class') # we deal with a CG resolution graph else: mol_graph = read_cgsmiles(smile) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index 1895da3..afdc2af 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -1,13 +1,14 @@ import re import copy import networkx as nx +from pysmiles.smiles_helper import _mark_chiral_atoms from .read_cgsmiles import read_cgsmiles from .read_fragments import read_fragments from .graph_utils import (merge_graphs, sort_nodes_by_attr, annotate_fragments, set_atom_names_atomistic) -from .pysmiles_utils import rebuild_h_atoms +from .pysmiles_utils import rebuild_h_atoms, annotate_ez_isomers def compatible(left, right): """ @@ -330,15 +331,19 @@ def resolve(self): # sort the atoms self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid")) + if all_atom: + # assign chirality + _mark_chiral_atoms(self.molecule) + # assign rs isomerism + annotate_ez_isomers(self.molecule) + # in all-atom MD there are common naming conventions + # that might be expected and hence we set them here + set_atom_names_atomistic(self.meta_graph, self.molecule) + # and redo the meta molecule self.meta_graph = annotate_fragments(self.meta_graph, self.molecule) - # in all-atom MD there are common naming conventions - # that might be expected and hence we set them here - if all_atom: - set_atom_names_atomistic(self.meta_graph, self.molecule) - # increment the resolution counter self.resolution_counter += 1 diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index b9db7b6..9323656 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -153,48 +153,100 @@ def test_read_cgsmiles(smile, nodes, edges, orders): fragnames = nx.get_node_attributes(meta_mol, 'fragname') assert nodes == list(fragnames.values()) -@pytest.mark.parametrize('big_smile, smile, bonding',( +@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez',( # smiple symmetric bonding ("[$]COC[$]", "COC", - {0: ["$1"], 2: ["$1"]}), + {0: ["$1"], 2: ["$1"]}, + None, + None), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", - {0: ["$1A1"], 2: ["$1A1"]}), + {0: ["$1A1"], 2: ["$1A1"]}, + None, + None), # smiple bonding multiletter atom ("Clc[$]c[$]", "Clcc", - {1: ["$1"], 2: ["$1"]}), + {1: ["$1"], 2: ["$1"]}, + None, + None), # simple symmetric but with explicit hydrogen ("[$][CH2]O[CH2][$]", "[CH2]O[CH2]", - {0: ["$1"], 2: ["$1"]}), + {0: ["$1"], 2: ["$1"]}, + None, + None), # smiple symmetric bonding; multiple descript ("[$]COC[$][$1]", "COC", - {0: ["$1"], 2: ["$1", "$11"]}), + {0: ["$1"], 2: ["$1", "$11"]}, + None, + None), # named different bonding descriptors ("[$1]CCCC[$2]", "CCCC", - {0: ["$11"], 3: ["$21"]}), + {0: ["$11"], 3: ["$21"]}, + None, + None), # ring and bonding descriptors ("[$1]CC[$2]C1CCCCC1", "CCC1CCCCC1", - {0: ["$11"], 1: ["$21"]}), + {0: ["$11"], 1: ["$21"]}, + None, + None), # bonding descript. after branch ("C(COC[$1])[$2]CCC[$3]", "C(COC)CCC", - {0: ["$21"], 3: ["$11"], 6: ["$31"]}), + {0: ["$21"], 3: ["$11"], 6: ["$31"]}, + None, + None), # left rigth bonding desciptors ("[>]COC[<]", "COC", - {0: [">1"], 2: ["<1"]}) + {0: [">1"], 2: ["<1"]}, + None, + None), + # simple chirality in residue + ("[>]C[C@](F)(B)N[<]", + "C[C](F)(B)N", + {0: [">1"], 4: ["<1"]}, + {1: ('@', [])}, + None), + # simple chirality inverse in residue + ("[>]C[C@@](F)(B)N[<]", + "C[C](F)(B)N", + {0: [">1"], 4: ["<1"]}, + {1: ('@@', [])}, + None), + # \ fragment split + ("[>]CC(\F)=[<]", + "CC(F)=", + {0: [">1"], 1: ["<2"]}, + None, + {2: (2, 1, '\\')}), + # / fragment split + ("[>]CC(/F)=[<]", + "CC(F)=", + {0: [">1"], 1: ["<2"]}, + None, + {2: (2, 1, '/')}), + # both in one fragment + ("[>]CC(/F)=C(\F)C[<]", + "CC(F)=C(F)C", + {0: [">1"], 5: ["<1"]}, + None, + {2: (2, 1, '/'), 4: (4, 3, '\\')}), )) -def test_strip_bonding_descriptors(big_smile, smile, bonding): - new_smile, new_bonding = strip_bonding_descriptors(big_smile) +def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez): + new_smile, new_bonding, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile) assert new_smile == smile assert new_bonding == bonding + if rs: + assert rs == rs_isomers + if ez: + assert ez == ez_isomers @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index b424ee0..6ad3fb7 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -52,7 +52,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): assert new_btypes == btypes -@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges',( +@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez',( # smiple linear seqeunce ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}", # 0 1 2 3 4 5 6 7 8 @@ -62,7 +62,8 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16), (16, 17)]), + (11, 14), (11, 15), (11, 16), (16, 17)], + {}, {}), # smiple linear seqeunce with bond-order in link ("{[#TC1][#TC4][#TC1]}.{#TC1=[$1]=CC=[$2],#TC4=[$1]=CC=[$2]}", # 0 1 2 3 4 5 6 7 8 9 @@ -72,7 +73,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'C C H H H H C C H H C C H H H H', [(0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (0, 6), (6, 7), (6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13), - (10, 14), (11, 15)]), + (10, 14), (11, 15)], {}, {}), # smiple linear seqeunce unconsumed bonding descrpt ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}", # 0 1 2 3 4 5 6 7 8 @@ -82,7 +83,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16), (16, 17)]), + (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}), # smiple linear seqeunce with ionic bond ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O-].[Na+]}", # 0 1 2 3 4 5 6 7 8 @@ -92,7 +93,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O Na C O C H H H H C O C H H H H O Na', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16), (16, 17)]), + (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}), # smiple linear seqeunce with ionic ending ("{[#OH][#PEO]|2[#ON]}.{#PEO=[$]COC[$],#OH=[$]O,#ON=[$][O-]}", # 0 1 2 3 4 5 6 7 8 @@ -102,7 +103,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16)]), + (11, 14), (11, 15), (11, 16)], {}, {}), # uncomsumed bonding IDs; note that this is not the same # molecule as previous test case. Here one of the OH branches # and replaces an CH2 group with CH-OH @@ -114,7 +115,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (2, 5), (2, 11), (3, 4), (4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13), - (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)]), + (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}), # simple branched sequence ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), @@ -122,7 +123,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'H C C H H H C O C H H H H H C C H H H C O C H H H H H H', [(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8), (8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27), - (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)]), + (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}), # something with a ring # 012 34567 # 890123456 @@ -135,7 +136,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25), (17, 26), (18, 19), (18, 27), (18, 33), (19, 20), (19, 24), (20, 21), (20, 28), (21, 22), (21, 29), (22, 23), (22, 30), - (23, 24), (23, 31), (24, 32)]), + (23, 24), (23, 31), (24, 32)], {}, {}), # something more complicated branched # here we have multiple bonding descriptors # # despite being the same residue we have 3 fragments after adding hydrgens @@ -157,7 +158,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H H'), ('B', 'C H H C H H H'),], 'O H C H H C H H H', [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), - (5, 6), (5, 7), (5, 8)]), + (5, 6), (5, 7), (5, 8)], {}, {}), # smiple squash operator; unconsumed operators ("{[#A][#B]}.{#A=OC[!],#B=[$][!]CC}", # 0 1 2 3 4 1 5 3 4 6 7 8 @@ -168,7 +169,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H H'), ('B', 'C H H C H H H'),], 'O H C H H C H H H', [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), - (5, 6), (5, 7), (5, 8)]), + (5, 6), (5, 7), (5, 8)], {}, {}), # smiple squash operator; plus connect operator ("{[#A][#B][#C]}.{#A=OC[!],#B=[$][!]CC,#C=[$]O}", # 0 1 2 3 4 1 5 3 4 6 7 8 @@ -179,7 +180,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H'), ('B', 'C H C H H H'), ('C', 'O H')], 'O H C H C H H H O H', [(0, 1), (0, 2), (2, 3), (2, 4), - (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)]), + (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}), # THF like test case with double edge and squash operator ("{[#A]1[#B]1}.{#A=[!]COC[!],#B=[!]CCCC[!]}", [('A', 'O C C H H H H'), @@ -187,7 +188,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O C C H H H H C C H H H H', [(0, 2), (0, 3), (2, 4), (2, 5), (3, 6), (3, 7), (2, 8), (3, 9), - (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)]), + (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}), # Toluene like test case with squash operator and aromaticity ("{[#SC3]1[#TC5][#TC5]1}.{#SC3=Cc(c[!])c[!],#TC5=[!]ccc[!]}", [('SC3', 'C C H H H C H C H'), @@ -195,19 +196,57 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'C C H H H C H C H C H C H C H', [(0, 1), (0, 2), (0, 3), (0, 4), (1, 5), (1, 7), (5, 9), (5, 6), (7, 13), (7, 8), - (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)]), + (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}), + # smiple chirality assigment between fragments + ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@H][>]C(=O)OC}", + # 0 1 2 3 + [('A', 'O H'), ('B', 'C C C O O C H H H H H H'), + ('C', 'O H')], + 'O H C H H C H C O O C H H H O H', + [(0, 1), (0, 2), (2, 3), (2, 4), + (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), + (10, 13), (5, 14), (14, 15)], + {3: (2, 4, 10, 14)}, {}), + # smiple chirality assigment between fragments inv + ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@@H][>]C(=O)OC}", + # 0 1 2 3 + [('A', 'O H'), ('B', 'C C C O O C H H H H H H'), + ('C', 'O H')], + 'O H C H H C H C O O C H H H O H', + [(0, 1), (0, 2), (2, 3), (2, 4), + (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), + (10, 13), (5, 14), (14, 15)], + {3: (2, 4, 14, 10)}, {}), + # smiple ez isomerism assigment between fragments inv + ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(\F)C}", + [('A', 'C C F H H H'), ('B', 'C F C H H H')], + 'C C F H H H F C C H H H', + [(0, 1), (1, 2), (0, 3), (0, 4), + (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], + {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}), + # smiple ez isomerism assigment between fragments inv + ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(/F)C}", + [('A', 'C C F H H H'), ('B', 'C F C H H H')], + 'C C F H H H F C C H H H', + [(0, 1), (1, 2), (0, 3), (0, 4), + (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], + {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}), )) -def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges): +def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez): meta_mol, molecule = MoleculeResolver.from_string(smile).resolve() # loop and compare fragments first + counter=0 for node, ref in zip(meta_mol.nodes, ref_frags): assert meta_mol.nodes[node]['fragname'] == ref[0] block_graph = meta_mol.nodes[node]['graph'] target_elements = nx.get_node_attributes(block_graph, 'element') sorted_elements = [target_elements[key] for key in sorted(target_elements)] + print("-->", sorted_elements) + print("-->",ref[1].split()) assert sorted_elements == ref[1].split() - + print(counter) + counter += 1 # make the full scale reference graph ref_graph = nx.Graph() ref_graph.add_edges_from(ref_edges) @@ -222,6 +261,14 @@ def _ele_match(n1, n2): # check that reference graph and molecule are isomorphic assert nx.is_isomorphic(ref_graph, molecule, node_match=_ele_match) + # check chirality + if chiral: + chiral_assigned = nx.get_node_attributes(molecule, 'rs_isomer') + assert chiral == chiral_assigned + # check ez isomerism + if ez: + ez_assigned = nx.get_node_attributes(molecule, 'ez_isomer') + assert ez == ez_assigned @pytest.mark.parametrize('case, cgsmiles_str, ref_string',( # case 1: here only the meta-graph is described by the From df38245921010aa23b6eb3cc6fbed50630f236fd Mon Sep 17 00:00:00 2001 From: "Dr. Fabian Grunewald" <32294573+fgrunewald@users.noreply.github.com> Date: Thu, 12 Sep 2024 17:28:56 +0200 Subject: [PATCH 2/7] Apply suggestions from code review Co-authored-by: Peter C Kroon --- cgsmiles/graph_utils.py | 8 +++++++- cgsmiles/pysmiles_utils.py | 2 +- cgsmiles/tests/test_molecule_resolve.py | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index 2d83d81..96bacdf 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -91,10 +91,16 @@ def sort_nodes_by_attr(graph, attr_dict = nx.get_node_attributes(new_graph, attr) new_dict = {} for key, values in attr_dict.items(): + try: + iter(values) + except TypeError: + is_list = False + else: + is_list = not isinstance(values, str) if is_list: new_values = [mapping[value] for value in values] else: - new_values = mapping[values] + new_values = mapping[values] new_dict[key] = new_values if new_dict: nx.set_node_attributes(new_graph, new_dict, attr) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 7f536cd..bfcc06f 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -63,7 +63,7 @@ def annotate_ez_isomers(molecule): Parameters ---------- molecule: nx.Graph - The molecule of intrest, which must of ez_isomer_pairs + The molecule of interest, which must of ez_isomer_pairs and ez_isomer_class set as node attributes """ ez_isomer_atoms = nx.get_node_attributes(molecule, 'ez_isomer_atoms') diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index 6ad3fb7..a0f146c 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -236,14 +236,14 @@ def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral meta_mol, molecule = MoleculeResolver.from_string(smile).resolve() # loop and compare fragments first - counter=0 + counter = 0 for node, ref in zip(meta_mol.nodes, ref_frags): assert meta_mol.nodes[node]['fragname'] == ref[0] block_graph = meta_mol.nodes[node]['graph'] target_elements = nx.get_node_attributes(block_graph, 'element') sorted_elements = [target_elements[key] for key in sorted(target_elements)] print("-->", sorted_elements) - print("-->",ref[1].split()) + print("-->", ref[1].split()) assert sorted_elements == ref[1].split() print(counter) counter += 1 From 8b1e8a221f81383a7f67a092a55b50161840910d Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 24 Sep 2024 13:52:43 +0200 Subject: [PATCH 3/7] draft --- cgsmiles/read_fragments.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 0d4e614..168bda3 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -35,6 +35,28 @@ def peek(self): def __iter__(self): return self +def deal_with_chiral_rings(smile_iter, token, node_count, rings): + """ + """ + multi_ring = False + ring_token = token + partial_str = token + while True: + if multi_ring and token == '%': + rings[ring_token].append(node_count) + elif multi_ring and token.isdigit(): + ring_token += token + elif token == '%': + ring_token += token + multi_ring = True + elif token.isdigit(): + rings[token] = node_count + else: + break + token = next(smile_iter) + partial_str += token + + return smile_iter, token, partial_str, rings def strip_bonding_descriptors(fragment_string): """ @@ -60,6 +82,7 @@ def strip_bonding_descriptors(fragment_string): bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0} smile_iter = PeekIter(fragment_string) bonding_descrpt = defaultdict(list) + rings = defaultdict(list) ez_isomer_atoms = {} rs_isomers = {} smile = "" @@ -92,13 +115,11 @@ def strip_bonding_descriptors(fragment_string): if smile_iter.peek() == '@': chiral_token = '@' + next(smile_iter) rs_isomers[node_count] = (chiral_token, []) - else: - atom += peek peek = next(smile_iter) + smile = smile + atom + "]" prev_node = node_count node_count += 1 - elif token == '(': anchor = prev_node smile += token @@ -108,7 +129,13 @@ def strip_bonding_descriptors(fragment_string): elif token in bond_to_order: current_order = bond_to_order[token] smile += token - elif token in '] H . - = # $ : + - %' or token.isdigit(): + # for chirality assignment we need to collect rings + elif token == '%' or token.isdigit(): + smile_iter, part_str, rings = deal_with_chiral_rings(smile_iter, + node_count, + rings) + smile += part_str + elif token in '] H . - = # $ : + -': smile += token # deal with ez isomers elif token in '/ \\': From 775c14eb4ec778f1ab84293f01a0d2f060c60aab Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 1 Oct 2024 15:29:09 +0200 Subject: [PATCH 4/7] have rings taken into account when dealing with chirality; account correctly for implicit hydrogens --- cgsmiles/pysmiles_utils.py | 60 ++++++++++++++++++++++++- cgsmiles/read_fragments.py | 43 ++++++++++++++---- cgsmiles/resolve.py | 9 ++-- cgsmiles/tests/test_molecule_resolve.py | 20 ++++++--- 4 files changed, 114 insertions(+), 18 deletions(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index f7c2480..d2c18bf 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -1,7 +1,11 @@ import logging import networkx as nx import pysmiles -from pysmiles.smiles_helper import _annotate_ez_isomers +from pysmiles.smiles_helper import (_annotate_ez_isomers, + _mark_chiral_atoms, + remove_explicit_hydrogens, + add_explicit_hydrogens) + LOGGER = logging.getLogger(__name__) def compute_mass(input_molecule): @@ -103,3 +107,57 @@ def annotate_ez_isomers(molecule): for node in ez_isomer_atoms: del molecule.nodes[node]['ez_isomer_atoms'] del molecule.nodes[node]['ez_isomer_class'] + +def mark_chiral_atoms(molecule): + """ + For all nodes tagged as chiral, figure out the three + substituents and annotate the node with a tuple that + has the order in which to rotate. This essentially + corresponds to the definition of an improper dihedral + angle centered on the chiral atom. + + Pysmiles treats explicit hydrogen atoms differently + from implicit ones. In cgsmiles at the time when this + function is called all hydrogen atoms have been added + explicitly. However, we need to correct the chirality + assigment for this. + + Note that this means it is not possible to have explicit + hydrogen atoms attached to chiral centers within cgsmiles + to begin with. However, this is a rather edge case. + """ + chiral_nodes = nx.get_node_attributes(molecule, 'rs_isomer') + for node, (direction, rings) in chiral_nodes.items(): + # first the ring atoms in order + # that they were connected then the + # other neighboring atoms in order + bonded_neighbours = sorted(molecule[node]) + neighbours = list(rings) + hstash = None + for neighbour in bonded_neighbours: + if neighbour not in neighbours: + # all hydrogen atoms are explicit in cgsmiles + # BUT in the input of chirality they are usual + # implicit. At this point we need to treat all + # hydrogen atoms as if they were implicit. + if molecule.nodes[neighbour]['element'] != 'H': + neighbours.append(neighbour) + else: + hstash = neighbour + if hstash: + neighbours.insert(1, hstash) + + if len(neighbours) != 4: + # FIXME Tetrahedral Allene-like Systems such as `NC(Br)=[C@]=C(O)C` + msg = (f"Chiral node {node} has {len(neighbours)} neighbors, which " + "is different than the four expected for tetrahedral " + "chirality.") + raise ValueError(msg) + # the default is anti-clockwise sorting indicated by '@' + # in this case the nodes are sorted with increasing + # node index; however @@ means clockwise and the + # order of nodes is reversed (i.e. with decreasing index) + if direction == '@@': + neighbours = [neighbours[0], neighbours[1], neighbours[3], neighbours[2]] + + molecule.nodes[node]['rs_isomer'] = tuple(neighbours) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 168bda3..e1c2558 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -35,12 +35,20 @@ def peek(self): def __iter__(self): return self +def _find_bonded_ring_node(ring_nodes, node): + current = ring_nodes.index(node) + if current%2 == 0: + other = ring_nodes[current+1] + else: + other = ring_nodes[current-1] + return other + def deal_with_chiral_rings(smile_iter, token, node_count, rings): """ """ multi_ring = False ring_token = token - partial_str = token + partial_str = "" while True: if multi_ring and token == '%': rings[ring_token].append(node_count) @@ -49,12 +57,21 @@ def deal_with_chiral_rings(smile_iter, token, node_count, rings): elif token == '%': ring_token += token multi_ring = True + elif multi_ring: + rings[ring_token].append(node_count) + ring_token = "" elif token.isdigit(): - rings[token] = node_count - else: - break - token = next(smile_iter) + rings[token].append(node_count) + partial_str += token + token = smile_iter.peek() + if token and not token.isdigit() and not token == '%': + break + + try: + token = next(smile_iter) + except StopIteration: + break return smile_iter, token, partial_str, rings @@ -115,6 +132,8 @@ def strip_bonding_descriptors(fragment_string): if smile_iter.peek() == '@': chiral_token = '@' + next(smile_iter) rs_isomers[node_count] = (chiral_token, []) + else: + atom += peek peek = next(smile_iter) smile = smile + atom + "]" @@ -131,9 +150,10 @@ def strip_bonding_descriptors(fragment_string): smile += token # for chirality assignment we need to collect rings elif token == '%' or token.isdigit(): - smile_iter, part_str, rings = deal_with_chiral_rings(smile_iter, - node_count, - rings) + smile_iter, token, part_str, rings = deal_with_chiral_rings(smile_iter, + token, + prev_node, + rings) smile += part_str elif token in '] H . - = # $ : + -': smile += token @@ -153,6 +173,12 @@ def strip_bonding_descriptors(fragment_string): prev_node = node_count node_count += 1 + # we need to annotate rings to the chiral isomers + for node in rs_isomers: + for ring_idx, ring_nodes in rings.items(): + if node in ring_nodes: + bonded_node = _find_bonded_ring_node(ring_nodes, node) + rs_isomers[node][1].append(bonded_node) return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms def fragment_iter(fragment_str, all_atom=True): @@ -197,6 +223,7 @@ def fragment_iter(fragment_str, all_atom=True): ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()} nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms') nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class') + print("hcount", mol_graph.nodes(data='hcount')) # we deal with a CG resolution graph else: mol_graph = read_cgsmiles(smile) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index c33899d..2376c1f 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -1,14 +1,15 @@ import re import copy import networkx as nx -from pysmiles.smiles_helper import _mark_chiral_atoms from .read_cgsmiles import read_cgsmiles from .read_fragments import read_fragments from .graph_utils import (merge_graphs, sort_nodes_by_attr, annotate_fragments, set_atom_names_atomistic) -from .pysmiles_utils import rebuild_h_atoms, annotate_ez_isomers +from .pysmiles_utils import (rebuild_h_atoms, + annotate_ez_isomers, + mark_chiral_atoms) def compatible(left, right, legacy=False): """ @@ -359,8 +360,8 @@ def resolve(self): self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid")) if all_atom: - # assign chirality - _mark_chiral_atoms(self.molecule) + # assign chiral atoms + mark_chiral_atoms(self.molecule) # assign rs isomerism annotate_ez_isomers(self.molecule) # in all-atom MD there are common naming conventions diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index e9b8f11..b907757 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -185,7 +185,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (0, 3), (0, 4), (1, 5), (1, 7), (5, 9), (5, 6), (7, 13), (7, 8), (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}), - # smiple chirality assigment between fragments + # simple chirality assigment between fragments ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@H][>]C(=O)OC}", # 0 1 2 3 [('A', 'O H'), ('B', 'C C C O O C H H H H H H'), @@ -194,8 +194,18 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), (10, 13), (5, 14), (14, 15)], - {3: (2, 4, 10, 14)}, {}), - # smiple chirality assigment between fragments inv + {3: (2, 10, 4, 14)}, {}), + # simple chirality assigment with rings + ("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}", + # 0 1 2 3 + [('GLC', 'C C C C C C O O O O O O H H H H H H H H H H H H')], + 'C C C C C C O O O O O O H H H H H H H H H H H H', + [(0, 1), (0, 11), (0, 12), (0, 13), (1, 2), (1, 6), (1, 14), (2, 3), (2, 10), + (2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18), + (7, 19), (8, 20), (9, 21), (10, 22), (11, 23)], + {1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4), + 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}), + # simple chirality assigment between fragments inv ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@@H][>]C(=O)OC}", # 0 1 2 3 [('A', 'O H'), ('B', 'C C C O O C H H H H H H'), @@ -204,7 +214,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), (10, 13), (5, 14), (14, 15)], - {3: (2, 4, 14, 10)}, {}), + {3: (2, 10, 14, 4)}, {}), # smiple ez isomerism assigment between fragments inv ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(\F)C}", [('A', 'C C F H H H'), ('B', 'C F C H H H')], @@ -212,7 +222,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (1, 2), (0, 3), (0, 4), (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}), - # smiple ez isomerism assigment between fragments inv + # simple ez isomerism assigment between fragments inv ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(/F)C}", [('A', 'C C F H H H'), ('B', 'C F C H H H')], 'C C F H H H F C C H H H', From a7a1eda46595eaeb2183ddd2b618b95c0ee1e403 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 1 Oct 2024 17:30:44 +0200 Subject: [PATCH 5/7] fix dep --- cgsmiles/resolve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index 2376c1f..a61df3e 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -366,7 +366,7 @@ def resolve(self): annotate_ez_isomers(self.molecule) # in all-atom MD there are common naming conventions # that might be expected and hence we set them here - set_atom_names_atomistic(self.meta_graph, self.molecule) + set_atom_names_atomistic(self.molecule, self.meta_graph) # and redo the meta molecule self.meta_graph = annotate_fragments(self.meta_graph, From e9a5232d25b3f59b4a8774970c790d6bc61c54ae Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sat, 5 Oct 2024 12:55:03 +0200 Subject: [PATCH 6/7] address comments --- cgsmiles/pysmiles_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index d2c18bf..83051bd 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -133,7 +133,7 @@ def mark_chiral_atoms(molecule): # other neighboring atoms in order bonded_neighbours = sorted(molecule[node]) neighbours = list(rings) - hstash = None + hstash = [] for neighbour in bonded_neighbours: if neighbour not in neighbours: # all hydrogen atoms are explicit in cgsmiles @@ -143,9 +143,13 @@ def mark_chiral_atoms(molecule): if molecule.nodes[neighbour]['element'] != 'H': neighbours.append(neighbour) else: - hstash = neighbour - if hstash: - neighbours.insert(1, hstash) + hstash.append(neighbour) + if hstash and len(hstash) == 1: + neighbours.insert(1, hstash[0]) + elif len(hstash) > 1: + msg = (f"Chiral node {node} as more than 1 hydrogen neighbor." + "Therefore it is not chiral.") + raise ValueError(msg) if len(neighbours) != 4: # FIXME Tetrahedral Allene-like Systems such as `NC(Br)=[C@]=C(O)C` From 3d58e50ed309f0a632ef407e684108bfb7641ad0 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sat, 5 Oct 2024 12:55:16 +0200 Subject: [PATCH 7/7] address comments --- cgsmiles/read_fragments.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index e1c2558..dafef22 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -43,8 +43,28 @@ def _find_bonded_ring_node(ring_nodes, node): other = ring_nodes[current-1] return other -def deal_with_chiral_rings(smile_iter, token, node_count, rings): +def collect_ring_number(smile_iter, token, node_count, rings): """ + When a ring identifier is found, this function will add + the current node to the rings dict. + + Parameters + ---------- + smile_iter: :class:PeekIter + token: str + node_count: int + rings: dict[list] + + Returns + ------- + PeekIter + the advanced smiles_iter + str + the current token being processed + str + the ring id + dict[list] + the updated rings dict """ multi_ring = False ring_token = token @@ -150,10 +170,10 @@ def strip_bonding_descriptors(fragment_string): smile += token # for chirality assignment we need to collect rings elif token == '%' or token.isdigit(): - smile_iter, token, part_str, rings = deal_with_chiral_rings(smile_iter, - token, - prev_node, - rings) + smile_iter, token, part_str, rings = collect_ring_number(smile_iter, + token, + prev_node, + rings) smile += part_str elif token in '] H . - = # $ : + -': smile += token