From d627ac22f171f398bb630fd6a0b462a76b74ea41 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 14 Oct 2024 18:37:10 +0200 Subject: [PATCH 01/16] implement weight annotation --- cgsmiles/pysmiles_utils.py | 5 ++ cgsmiles/read_fragments.py | 42 +++++++++++++-- cgsmiles/tests/test_molecule_resolve.py | 68 +++++++++++++++++-------- 3 files changed, 91 insertions(+), 24 deletions(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 83051bd..419efb3 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -83,6 +83,11 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): ref_node = next(mol_graph.neighbors(node)) mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"] mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"] + if mol_graph.nodes[node].get("element", "*") == "H": + # make sure the weights are copied for implicit h-atoms + anchor = list(mol_graph.neighbors(node))[0] + weight = mol_graph.nodes[anchor].get("weight", 1) + mol_graph.nodes[node]["weight"] = weight def annotate_ez_isomers(molecule): """ diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index f065317..fa28829 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -95,6 +95,31 @@ def collect_ring_number(smile_iter, token, node_count, rings): return smile_iter, token, partial_str, rings +def get_weight(smile_iter): + """ + Extracts weights given to atoms/nodes in + fragments. The iter should be advanced + up to the weight marker ;. + + Parameters + ---------- + smile_iter: class.PeekIter + + Returns + ------- + float: + the weight + PeekIter + the advanced iter object + """ + num = [] + for digit in smile_iter: + num.append(digit) + if smile_iter.peek() in [']', '@', 'H']: + break + out = float("".join(num)) + return out, smile_iter + def strip_bonding_descriptors(fragment_string): """ Processes a CGSmiles fragment string by @@ -122,6 +147,7 @@ def strip_bonding_descriptors(fragment_string): rings = defaultdict(list) ez_isomer_atoms = {} rs_isomers = {} + weights = {} smile = "" node_count = 0 prev_node = 0 @@ -147,6 +173,8 @@ def strip_bonding_descriptors(fragment_string): bonding_descrpt[prev_node].append(bond_descrp + str(order)) else: atom = token + # set the default weight + weights[node_count] = 1 while peek != ']': # deal with rs chirality if peek == '@': @@ -154,6 +182,10 @@ def strip_bonding_descriptors(fragment_string): if smile_iter.peek() == '@': chiral_token = '@' + next(smile_iter) rs_isomers[node_count] = (chiral_token, []) + # we have weights + elif peek == ';': + weight, smile_iter = get_weight(smile_iter) + weights[node_count] = weight else: atom += peek peek = next(smile_iter) @@ -193,6 +225,8 @@ def strip_bonding_descriptors(fragment_string): smile += token current_order = None prev_node = node_count + # set default weight + weights[node_count] = 1 node_count += 1 # we need to annotate rings to the chiral isomers @@ -201,7 +235,8 @@ def strip_bonding_descriptors(fragment_string): if node in ring_nodes: bonded_node = _find_bonded_ring_node(ring_nodes, node) rs_isomers[node][1].append(bonded_node) - return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms + + return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, weights def fragment_iter(fragment_str, all_atom=True): """ @@ -230,8 +265,8 @@ def fragment_iter(fragment_str, all_atom=True): for fragment in fragment_str[1:-1].split(','): delim = fragment.find('=', 0) fragname = fragment[1:delim] - big_smile = fragment[delim+1:] - smile, bonding_descrpt, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile) + frag_smile = fragment[delim+1:] + smile, bonding_descrpt, rs_isomers, ez_isomers, weights = strip_bonding_descriptors(frag_smile) if smile == "H": mol_graph = nx.Graph() mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) @@ -258,6 +293,7 @@ def fragment_iter(fragment_str, all_atom=True): nx.set_node_attributes(mol_graph, fragname, 'fragname') nx.set_node_attributes(mol_graph, 0, 'fragid') + nx.set_node_attributes(mol_graph, weights, 'weight') yield fragname, mol_graph def read_fragments(fragment_str, all_atom=True, fragment_dict=None): diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index afdbf89..14b1d70 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -40,7 +40,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): assert new_btypes == btypes -@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez',( +@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez, weights',( # smiple linear seqeunce ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}", # 0 1 2 3 4 5 6 7 8 @@ -51,7 +51,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)], - {}, {}), + {}, {}, {}), # smiple linear seqeunce with bond-order in link ("{[#TC1][#TC4][#TC1]}.{#TC1=[$1]=CC=[$2],#TC4=[$1]=CC=[$2]}", # 0 1 2 3 4 5 6 7 8 9 @@ -61,7 +61,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'C C H H H H C C H H C C H H H H', [(0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (0, 6), (6, 7), (6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13), - (10, 14), (11, 15)], {}, {}), + (10, 14), (11, 15)], {}, {}, {}), # smiple linear seqeunce unconsumed bonding descrpt ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}", # 0 1 2 3 4 5 6 7 8 @@ -71,7 +71,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}), + (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}, {}), # smiple linear seqeunce with ionic bond ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O-].[Na+]}", # 0 1 2 3 4 5 6 7 8 @@ -81,7 +81,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O Na C O C H H H H C O C H H H H O Na', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}), + (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}, {}), # smiple linear seqeunce with ionic ending ("{[#OH][#PEO]|2[#ON]}.{#PEO=[$]COC[$],#OH=[$]O,#ON=[$][O-]}", # 0 1 2 3 4 5 6 7 8 @@ -91,7 +91,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16)], {}, {}), + (11, 14), (11, 15), (11, 16)], {}, {}, {}), # uncomsumed bonding IDs; note that this is not the same # molecule as previous test case. Here one of the OH branches # and replaces an CH2 group with CH-OH @@ -103,7 +103,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (2, 5), (2, 11), (3, 4), (4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13), - (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}), + (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}, {}), # simple branched sequence ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), @@ -111,7 +111,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'H C C H H H C O C H H H H H C C H H H C O C H H H H H H', [(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8), (8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27), - (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}), + (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}, {}), # something with a ring # 012 34567 # 890123456 @@ -124,7 +124,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25), (17, 26), (18, 19), (18, 27), (18, 33), (19, 20), (19, 24), (20, 21), (20, 28), (21, 22), (21, 29), (22, 23), (22, 30), - (23, 24), (23, 31), (24, 32)], {}, {}), + (23, 24), (23, 31), (24, 32)], {}, {}, {}), # something more complicated branched # here we have multiple bonding descriptors # # despite being the same residue we have 3 fragments after adding hydrgens @@ -146,7 +146,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H H'), ('B', 'C H H C H H H'),], 'O H C H H C H H H', [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), - (5, 6), (5, 7), (5, 8)], {}, {}), + (5, 6), (5, 7), (5, 8)], {}, {}, {}), # smiple squash operator; unconsumed operators ("{[#A][#B]}.{#A=OC[!],#B=[$][!]CC}", # 0 1 2 3 4 1 5 3 4 6 7 8 @@ -157,7 +157,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H H'), ('B', 'C H H C H H H'),], 'O H C H H C H H H', [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), - (5, 6), (5, 7), (5, 8)], {}, {}), + (5, 6), (5, 7), (5, 8)], {}, {}, {}), # smiple squash operator; plus connect operator ("{[#A][#B][#C]}.{#A=OC[!],#B=[$][!]CC,#C=[$]O}", # 0 1 2 3 4 1 5 3 4 6 7 8 @@ -168,7 +168,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H'), ('B', 'C H C H H H'), ('C', 'O H')], 'O H C H C H H H O H', [(0, 1), (0, 2), (2, 3), (2, 4), - (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}), + (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}, {}), # THF like test case with double edge and squash operator ("{[#A]=[#B]}.{#A=[!]COC[!],#B=[!]CCCC[!]}", [('A', 'O C C H H H H'), @@ -176,7 +176,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O C C H H H H C C H H H H', [(0, 2), (0, 3), (2, 4), (2, 5), (3, 6), (3, 7), (2, 8), (3, 9), - (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}), + (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}, {}), # Toluene like test case with squash operator and aromaticity ("{[#SC3]1[#TC5][#TC5]1}.{#SC3=Cc(c[!])c[!],#TC5=[!]ccc[!]}", [('SC3', 'C C H H H C H C H'), @@ -184,7 +184,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'C C H H H C H C H C H C H C H', [(0, 1), (0, 2), (0, 3), (0, 4), (1, 5), (1, 7), (5, 9), (5, 6), (7, 13), (7, 8), - (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}), + (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}, {}), # simple chirality assigment with rings ("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}", # 0 1 2 3 @@ -194,7 +194,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18), (7, 19), (8, 20), (9, 21), (10, 22), (11, 23)], {1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4), - 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}), + 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}, {}), # simple chirality assigment between fragments ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@H][>]C(=O)OC}", # 0 1 2 3 @@ -204,7 +204,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), (10, 13), (5, 14), (14, 15)], - {3: (2, 10, 4, 14)}, {}), + {3: (2, 10, 4, 14)}, {}, {}), # simple chirality assigment between fragments inv ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@@H][>]C(=O)OC}", # 0 1 2 3 @@ -214,21 +214,21 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), (10, 13), (5, 14), (14, 15)], - {3: (2, 10, 14, 4)}, {}), + {3: (2, 10, 14, 4)}, {}, {}), # smiple ez isomerism assigment between fragments inv ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(\F)C}", [('A', 'C C F H H H'), ('B', 'C F C H H H')], 'C C F H H H F C C H H H', [(0, 1), (1, 2), (0, 3), (0, 4), (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], - {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}), + {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}, {}), # simple ez isomerism assigment between fragments inv ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(/F)C}", [('A', 'C C F H H H'), ('B', 'C F C H H H')], 'C C F H H H F C C H H H', [(0, 1), (1, 2), (0, 3), (0, 4), (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], - {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}), + {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}, {}), # test skip virtual nodes ("{[#SP4]1.2[#SP4].3[#SP1r]1.[#TC4]23}.{#SP4=OC[$]C[$]O,#SP1r=[$]OC[$]CO}", [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), @@ -238,9 +238,29 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), (18, 21), (18, 22), (19, 23)], - {},{}), + {},{}, {}), + # test weights + ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5]C[$]C[$]O,#SP1r=[$]OC[$]CO}", + [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), + ('SP1r', 'O C C O H H H H')], + 'O C C O H H H H O C C O H H H H O C C O H H H H', + [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6), + (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), + (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), + (18, 21), (18, 22), (19, 23)], + {},{}, {0: 0.5, 4: 0.5, 8: 0.5, 12: 0.5}), + # test 2 weights + ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5][C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}", + [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), + ('SP1r', 'O C C O H H H H')], + 'O C C O H H H H O C C O H H H H O C C O H H H H', + [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6), + (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), + (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), + (18, 21), (18, 22), (19, 23)], + {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.5, 8: 0.5, 9: 0.1, 12: 0.5, 13: 0.1}), )) -def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez): +def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez, weights): meta_mol, molecule = MoleculeResolver.from_string(smile).resolve() # loop and compare fragments first @@ -278,6 +298,12 @@ def _ele_match(n1, n2): if ez: ez_assigned = nx.get_node_attributes(molecule, 'ez_isomer') assert ez == ez_assigned + # check weights + if weights: + mol_weights = {node: 1 for node in ref_graph} + mol_weights.update(weights) + weights_assigned = nx.get_node_attributes(molecule, 'weight') + assert mol_weights == weights_assigned @pytest.mark.parametrize('case, cgsmiles_str, ref_string',( # case 1: here only the meta-graph is described by the From 985500ad705c268cb04abf9b8f4428fdda9b2e7f Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 14 Oct 2024 18:56:50 +0200 Subject: [PATCH 02/16] update tests --- cgsmiles/tests/test_cgsmile_parsing.py | 64 ++++++++++++++++---------- cgsmiles/tests/test_sampler.py | 1 + 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 77333c9..2f947ec 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -183,94 +183,108 @@ def test_read_cgsmiles(smile, nodes, edges, orders): fragnames = nx.get_node_attributes(meta_mol, 'fragname') assert nodes == list(fragnames.values()) -@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez',( +@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, weights',( # smiple symmetric bonding ("[$]COC[$]", "COC", {0: ["$1"], 2: ["$1"]}, None, + None, None), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", {0: ["$1A1"], 2: ["$1A1"]}, None, + None, None), # smiple bonding multiletter atom ("Clc[$]c[$]", "Clcc", {1: ["$1"], 2: ["$1"]}, None, + None, None), # simple symmetric but with explicit hydrogen ("[$][CH2]O[CH2][$]", "[CH2]O[CH2]", {0: ["$1"], 2: ["$1"]}, None, + None, None), # smiple symmetric bonding; multiple descript ("[$]COC[$][$1]", "COC", {0: ["$1"], 2: ["$1", "$11"]}, None, + None, None), # named different bonding descriptors ("[$1]CCCC[$2]", "CCCC", {0: ["$11"], 3: ["$21"]}, None, + None, None), # ring and bonding descriptors ("[$1]CC[$2]C1CCCCC1", "CCC1CCCCC1", {0: ["$11"], 1: ["$21"]}, None, + None, None), # bonding descript. after branch ("C(COC[$1])[$2]CCC[$3]", "C(COC)CCC", {0: ["$21"], 3: ["$11"], 6: ["$31"]}, None, + None, None), # left rigth bonding desciptors ("[>]COC[<]", "COC", {0: [">1"], 2: ["<1"]}, None, + None, None), # simple chirality in residue ("[>]C[C@](F)(B)N[<]", "C[C](F)(B)N", {0: [">1"], 4: ["<1"]}, {1: ('@', [])}, + None, None), # simple chirality inverse in residue ("[>]C[C@@](F)(B)N[<]", "C[C](F)(B)N", {0: [">1"], 4: ["<1"]}, {1: ('@@', [])}, + None, None), # \ fragment split ("[>]CC(\F)=[<]", "CC(F)", {0: [">1"], 1: ["<2"]}, None, - {2: (2, 1, '\\')}), + {2: (2, 1, '\\')}, + None), # / fragment split ("[>]CC(/F)=[<]", "CC(F)", {0: [">1"], 1: ["<2"]}, None, - {2: (2, 1, '/')}), + {2: (2, 1, '/')}, + None), # both in one fragment ("[>]CC(/F)=C(\F)C[<]", "CC(F)=C(F)C", {0: [">1"], 5: ["<1"]}, None, - {2: (2, 1, '/'), 4: (4, 3, '\\')}), + {2: (2, 1, '/'), 4: (4, 3, '\\')}, + None), )) -def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez): - new_smile, new_bonding, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile) +def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights): + new_smile, new_bonding, rs_isomers, ez_isomers, weights = strip_bonding_descriptors(big_smile) assert new_smile == smile assert new_bonding == bonding if rs: @@ -281,50 +295,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez): @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment ("{#PEO=[$]COC[$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), )}, {"PEO": [(0, 1), (1, 2)]}), # single fragment but with explicit hydrogen in smiles ("{#PEO=[$][CH2]O[CH2][$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), )}, {"PEO": [(0, 1), (1, 2)]}), # test NH3 terminal ("{#AMM=N[$]}", - {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3}), + {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "weight": 1}), )}, {"AMM": []}), # single fragment + 1 terminal (i.e. only 1 bonding descrpt ("{#PEO=[$]COC[$],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "weight": 1}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # this adjust the hydrogen count ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "weight": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # but explicit hydrogen in the smiles string ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "weight": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}), )}, {"PEO": [(0, 1), (1, 2),], "OHter": []}), diff --git a/cgsmiles/tests/test_sampler.py b/cgsmiles/tests/test_sampler.py index e1b08cd..158178f 100644 --- a/cgsmiles/tests/test_sampler.py +++ b/cgsmiles/tests/test_sampler.py @@ -104,6 +104,7 @@ def test_add_fragment(graph_str, ref_graph = read_cgsmiles(ref_mol) nx.set_node_attributes(ref_graph, bonding, 'bonding') nx.set_node_attributes(ref_graph, fragid, 'fragid') + nx.set_node_attributes(ref_graph, 1, 'weight') atomnames = nx.get_node_attributes(ref_graph, 'fragname') nx.set_node_attributes(ref_graph, atomnames, 'atomname') nx.set_node_attributes(ref_graph, resnames, 'fragname') From fd838e181acde097ff4b21b519a234681b02d9df Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 14 Oct 2024 19:02:10 +0200 Subject: [PATCH 03/16] add tests --- cgsmiles/tests/test_cgsmile_parsing.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 2f947ec..3ca612d 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -191,6 +191,21 @@ def test_read_cgsmiles(smile, nodes, edges, orders): None, None, None), + # smiple symmetric bonding with weigth + ("[$]C[O;0.5]C[$]", + "C[O]C", + {0: ["$1"], 2: ["$1"]}, + None, + None, + {1: 0.5}), + # smiple symmetric bonding with weigth + # using cgsmiles string + ("[$][#TC4][#OT1;0.5][#CD1][$]", + "[#TC4][#OT1][#CD1]", + {0: ["$1"], 2: ["$1"]}, + None, + None, + {1: 0.5}), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", @@ -284,13 +299,19 @@ def test_read_cgsmiles(smile, nodes, edges, orders): None), )) def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights): - new_smile, new_bonding, rs_isomers, ez_isomers, weights = strip_bonding_descriptors(big_smile) + new_smile, new_bonding, rs_isomers, ez_isomers, weights_out = strip_bonding_descriptors(big_smile) assert new_smile == smile assert new_bonding == bonding if rs: assert rs == rs_isomers if ez: assert ez == ez_isomers + # here we check that the weights are correctly + # set for nodes with weights; the default is + # checked in another test + if weights: + for node, weight in weights.items(): + assert weights_out[node] == weight @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment From 4c4a12bf6308da4c5bacf67d2ad140cbb6f5f4de Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 31 Oct 2024 16:16:31 +0100 Subject: [PATCH 04/16] have hydrogen weights --- cgsmiles/read_fragments.py | 18 ++++++++--- cgsmiles/tests/test_cgsmile_parsing.py | 42 +++++++++++++++++++++----- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index fa28829..61be22e 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -148,6 +148,7 @@ def strip_bonding_descriptors(fragment_string): ez_isomer_atoms = {} rs_isomers = {} weights = {} + hydrogen_weights = defaultdict(list) smile = "" node_count = 0 prev_node = 0 @@ -185,7 +186,12 @@ def strip_bonding_descriptors(fragment_string): # we have weights elif peek == ';': weight, smile_iter = get_weight(smile_iter) - weights[node_count] = weight + # hydrogen atoms are implicit so we filter + # them out here + if atom[1:] == 'H': + hydrogen_weights[prev_node].append(weight) + else: + weights[node_count] = weight else: atom += peek peek = next(smile_iter) @@ -236,7 +242,7 @@ def strip_bonding_descriptors(fragment_string): bonded_node = _find_bonded_ring_node(ring_nodes, node) rs_isomers[node][1].append(bonded_node) - return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, weights + return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, weights, hydrogen_weights def fragment_iter(fragment_str, all_atom=True): """ @@ -266,13 +272,15 @@ def fragment_iter(fragment_str, all_atom=True): delim = fragment.find('=', 0) fragname = fragment[1:delim] frag_smile = fragment[delim+1:] - smile, bonding_descrpt, rs_isomers, ez_isomers, weights = strip_bonding_descriptors(frag_smile) + smile, bonding_descrpt, rs_isomers, ez_isomers, weights, h_weights = strip_bonding_descriptors(frag_smile) if smile == "H": mol_graph = nx.Graph() mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') elif all_atom: - mol_graph = pysmiles.read_smiles(smile, reinterpret_aromatic=False, strict=False) + mol_graph = pysmiles.read_smiles(smile, + reinterpret_aromatic=False, + strict=False) nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer') # we need to split countable node keys and the associated value @@ -280,6 +288,8 @@ def fragment_iter(fragment_str, all_atom=True): ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()} nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms') nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class') + # set the hydrogen weight attribute + nx.set_node_attributes(mol_graph, h_weights, 'hweight') # we deal with a CG resolution graph else: mol_graph = read_cgsmiles(smile) diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 5da2572..5d0a8b5 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -262,35 +262,48 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): set_charges = nx.get_node_attributes(meta_mol, 'charge') assert set_charges == charges -@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez',( +@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, weights, hweights',( # smiple symmetric bonding ("[$]COC[$]", "COC", {0: ["$1"], 2: ["$1"]}, None, None, + None, None), - # smiple symmetric bonding with weigth + # smiple symmetric bonding with weight ("[$]C[O;0.5]C[$]", "C[O]C", {0: ["$1"], 2: ["$1"]}, None, None, - {1: 0.5}), - # smiple symmetric bonding with weigth + {1: 0.5}, + None), + # smiple symmetric bonding with weight # using cgsmiles string ("[$][#TC4][#OT1;0.5][#CD1][$]", "[#TC4][#OT1][#CD1]", {0: ["$1"], 2: ["$1"]}, None, None, - {1: 0.5}), + {1: 0.5}, + None), + # smiple symmetric bonding with weight + # using open smiles and hweights + ("[$]CO[C;0.5][$]([H;0.1])[H;0.2]", + "CO[C]([H])[H]", + {0: ["$1"], 2: ["$1"]}, + None, + None, + {2: 0.5}, + {2: [0.1, 0.2]}), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", {0: ["$1A1"], 2: ["$1A1"]}, None, None, + None, None), # smiple bonding multiletter atom ("Clc[$]c[$]", @@ -298,6 +311,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {1: ["$1"], 2: ["$1"]}, None, None, + None, None), # simple symmetric but with explicit hydrogen ("[$][CH2]O[CH2][$]", @@ -305,6 +319,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, + None, None), # smiple symmetric bonding; multiple descript ("[$]COC[$][$1]", @@ -312,6 +327,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1", "$11"]}, None, None, + None, None), # named different bonding descriptors ("[$1]CCCC[$2]", @@ -319,6 +335,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$11"], 3: ["$21"]}, None, None, + None, None), # ring and bonding descriptors ("[$1]CC[$2]C1CCCCC1", @@ -326,6 +343,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$11"], 1: ["$21"]}, None, None, + None, None), # bonding descript. after branch ("C(COC[$1])[$2]CCC[$3]", @@ -333,6 +351,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$21"], 3: ["$11"], 6: ["$31"]}, None, None, + None, None), # left rigth bonding desciptors ("[>]COC[<]", @@ -340,6 +359,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 2: ["<1"]}, None, None, + None, None), # simple chirality in residue ("[>]C[C@](F)(B)N[<]", @@ -347,6 +367,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 4: ["<1"]}, {1: ('@', [])}, None, + None, None), # simple chirality inverse in residue ("[>]C[C@@](F)(B)N[<]", @@ -354,6 +375,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 4: ["<1"]}, {1: ('@@', [])}, None, + None, None), # \ fragment split ("[>]CC(\F)=[<]", @@ -361,6 +383,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 1: ["<2"]}, None, {2: (2, 1, '\\')}, + None, None), # / fragment split ("[>]CC(/F)=[<]", @@ -368,6 +391,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 1: ["<2"]}, None, {2: (2, 1, '/')}, + None, None), # both in one fragment ("[>]CC(/F)=C(\F)C[<]", @@ -375,10 +399,11 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 5: ["<1"]}, None, {2: (2, 1, '/'), 4: (4, 3, '\\')}, + None, None), )) -def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights): - new_smile, new_bonding, rs_isomers, ez_isomers, weights_out = strip_bonding_descriptors(big_smile) +def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights, hweights): + new_smile, new_bonding, rs_isomers, ez_isomers, weights_out, hweights_out = strip_bonding_descriptors(big_smile) assert new_smile == smile assert new_bonding == bonding if rs: @@ -391,6 +416,9 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights): if weights: for node, weight in weights.items(): assert weights_out[node] == weight + if hweights: + for node, weight in hweights.items(): + assert hweights_out[node] == weight @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment From fa55b8765a9a4561fdadf7c2cb9fb6f2a5b2c1a1 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 31 Oct 2024 16:52:35 +0100 Subject: [PATCH 05/16] tests for hydrogen weights --- cgsmiles/tests/test_cgsmile_parsing.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 5d0a8b5..dfadbb6 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -297,6 +297,14 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): None, {2: 0.5}, {2: [0.1, 0.2]}), + # H atom with weight goes first + ("[H;0.3]C[$]O[C;0.5][$]", + "[H]CO[C]", + {1: ["$1"], 3: ["$1"]}, + None, + None, + {3: 0.5}, + {1: [0.3]}), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", From 39e2857932299fb37ed40ea066d8eaf868a805fb Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 31 Oct 2024 16:52:53 +0100 Subject: [PATCH 06/16] take care of case where h comes first --- cgsmiles/read_fragments.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 61be22e..6262e8e 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -188,7 +188,9 @@ def strip_bonding_descriptors(fragment_string): weight, smile_iter = get_weight(smile_iter) # hydrogen atoms are implicit so we filter # them out here - if atom[1:] == 'H': + if atom[1:] == 'H' and node_count == 0: + hydrogen_weights[1].append(weight) + elif atom[1:] == 'H': hydrogen_weights[prev_node].append(weight) else: weights[node_count] = weight From 5c27942e930dd9adba619d998be2216e7269f921 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 31 Oct 2024 16:53:16 +0100 Subject: [PATCH 07/16] have hydrogen weights reconstructed --- cgsmiles/pysmiles_utils.py | 9 +++++++-- cgsmiles/tests/test_molecule_resolve.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 419efb3..ef7cd5a 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -84,9 +84,14 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"] mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"] if mol_graph.nodes[node].get("element", "*") == "H": - # make sure the weights are copied for implicit h-atoms anchor = list(mol_graph.neighbors(node))[0] - weight = mol_graph.nodes[anchor].get("weight", 1) + # the weight for the hydrogen atom was explicitly set + hweights = mol_graph.nodes[anchor].get('hweight', []) + if hweights: + weight = hweights.pop() + # make sure the weights are copied for implicit h-atoms + else: + weight = mol_graph.nodes[anchor].get("weight", 1) mol_graph.nodes[node]["weight"] = weight def annotate_ez_isomers(molecule): diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index 00880d7..6dabcd7 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -259,6 +259,16 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), (18, 21), (18, 22), (19, 23)], {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.5, 8: 0.5, 9: 0.1, 12: 0.5, 13: 0.1}), + # test 2 weights and hydrogen weights + ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[O;0.5]([H;0.2])[C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}", + [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), + ('SP1r', 'O C C O H H H H')], + 'O C C O H H H H O C C O H H H H O C C O H H H H', + [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6), + (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), + (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), + (18, 21), (18, 22), (19, 23)], + {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.2, 8: 0.5, 9: 0.1, 12: 0.2, 13: 0.1}), )) def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez, weights): meta_mol, molecule = MoleculeResolver.from_string(smile).resolve() From 49ca226dbf0210eb15cdb8e8ba20ea1623b9e929 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 8 Nov 2024 11:20:55 +0100 Subject: [PATCH 08/16] implement dialects and annotations --- cgsmiles/pysmiles_utils.py | 4 +- cgsmiles/read_cgsmiles.py | 29 ++++----- cgsmiles/read_fragments.py | 66 ++++++++------------ cgsmiles/tests/test_cgsmile_parsing.py | 81 +++++++++++++++---------- cgsmiles/tests/test_molecule_resolve.py | 19 +++--- cgsmiles/tests/test_sampler.py | 2 +- 6 files changed, 100 insertions(+), 101 deletions(-) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index ef7cd5a..28fe0c2 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -91,8 +91,8 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): weight = hweights.pop() # make sure the weights are copied for implicit h-atoms else: - weight = mol_graph.nodes[anchor].get("weight", 1) - mol_graph.nodes[node]["weight"] = weight + weight = mol_graph.nodes[anchor].get("w", 1) + mol_graph.nodes[node]["w"] = weight def annotate_ez_isomers(molecule): """ diff --git a/cgsmiles/read_cgsmiles.py b/cgsmiles/read_cgsmiles.py index fbbbe4e..aaf52dd 100644 --- a/cgsmiles/read_cgsmiles.py +++ b/cgsmiles/read_cgsmiles.py @@ -2,6 +2,7 @@ import re import numpy as np import networkx as nx +from .dialects import parse_graph_base_node PATTERNS = {"bond_anchor": r"\[\$.*?\]", "place_holder": r"\[\#.*?\]", @@ -31,21 +32,21 @@ def _expand_branch(mol_graph, current, anchor, recipe): anchor: abc.hashable anchor to which to connect current node - recipe: list[(str, int, int)] + recipe: list[(str, int, dict, int)] list storing tuples of node names and - the number of times the node has to be added - and their bond order + the number of times the node has to be added, + a dict of attributes and the bond order Returns ------- nx.Graph """ prev_node = anchor - for bdx, (fragname, n_mon, order) in enumerate(recipe): + for bdx, (n_mon, attributes, order) in enumerate(recipe): if bdx == 0: anchor = current for _ in range(0, n_mon): - mol_graph.add_node(current, fragname=fragname) + mol_graph.add_node(current, **attributes) mol_graph.add_edge(prev_node, current, order=order) prev_node = current @@ -144,7 +145,7 @@ def read_cgsmiles(pattern): # the recipe for making the branch includes the anchor; # which is hence the first residue in the list # at this point the bond order is still 1 unless we have an expansion - recipes[branch_anchor[-1]] = [(mol_graph.nodes[prev_node]['fragname'], 1, 1)] + recipes[branch_anchor[-1]] = [(1, attributes, 1)] # here we check if the atom is followed by a cycle marker # in this case we have an open cycle and close it @@ -215,26 +216,18 @@ def read_cgsmiles(pattern): # the fragname starts at the second character and ends # one before the last according to the above pattern fragname = match.group(0)[2:-1] - # check for charge - charge = 0.0 - for sign in ["+", "-"]: - if sign in fragname: - fragname, charge = fragname.split(sign) - if len(charge) == 0: - charge = float(sign+"1") - else: - charge = float(sign+charge) + # read the annotations + attributes = parse_graph_base_node(fragname) # if this residue is part of a branch we store it in # the recipe dict together with the anchor residue # and expansion number if branching: - recipes[branch_anchor[-1]].append((fragname, n_mon, prev_bond_order)) - + recipes[branch_anchor[-1]].append((n_mon, attributes, prev_bond_order)) # new we add new residue as often as required connection = [] for _ in range(0, n_mon): - mol_graph.add_node(current, fragname=fragname, charge=charge) + mol_graph.add_node(current, **attributes) if prev_node is not None: mol_graph.add_edge(prev_node, current, order=prev_bond_order) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 6262e8e..d713477 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -5,6 +5,7 @@ import networkx as nx import pysmiles from .read_cgsmiles import read_cgsmiles +from .dialects import _fragment_node_parser class PeekIter(object): """ @@ -95,31 +96,6 @@ def collect_ring_number(smile_iter, token, node_count, rings): return smile_iter, token, partial_str, rings -def get_weight(smile_iter): - """ - Extracts weights given to atoms/nodes in - fragments. The iter should be advanced - up to the weight marker ;. - - Parameters - ---------- - smile_iter: class.PeekIter - - Returns - ------- - float: - the weight - PeekIter - the advanced iter object - """ - num = [] - for digit in smile_iter: - num.append(digit) - if smile_iter.peek() in [']', '@', 'H']: - break - out = float("".join(num)) - return out, smile_iter - def strip_bonding_descriptors(fragment_string): """ Processes a CGSmiles fragment string by @@ -147,7 +123,8 @@ def strip_bonding_descriptors(fragment_string): rings = defaultdict(list) ez_isomer_atoms = {} rs_isomers = {} - weights = {} + attributes = defaultdict(dict) + record_attributes = False hydrogen_weights = defaultdict(list) smile = "" node_count = 0 @@ -174,8 +151,7 @@ def strip_bonding_descriptors(fragment_string): bonding_descrpt[prev_node].append(bond_descrp + str(order)) else: atom = token - # set the default weight - weights[node_count] = 1 + attribute_str = "" while peek != ']': # deal with rs chirality if peek == '@': @@ -185,22 +161,29 @@ def strip_bonding_descriptors(fragment_string): rs_isomers[node_count] = (chiral_token, []) # we have weights elif peek == ';': - weight, smile_iter = get_weight(smile_iter) - # hydrogen atoms are implicit so we filter - # them out here - if atom[1:] == 'H' and node_count == 0: - hydrogen_weights[1].append(weight) - elif atom[1:] == 'H': - hydrogen_weights[prev_node].append(weight) - else: - weights[node_count] = weight + record_attributes = True + elif record_attributes: + attribute_str += peek else: atom += peek + peek = next(smile_iter) + record_attributes=False + # here we do some post processing cleanup + node_attributes = _fragment_node_parser(attribute_str) + attributes[node_count].update(node_attributes) + # hydrogen atoms are implicit so we filter + # them out here + if atom[1:] == 'H' and node_count == 0: + hydrogen_weights[1].append(attributes[node_count]['w']) + elif atom[1:] == 'H': + hydrogen_weights[prev_node].append(attributes[node_count]['w']) + smile = smile + atom + "]" prev_node = node_count node_count += 1 + elif token == '(': anchor = prev_node smile += token @@ -234,7 +217,7 @@ def strip_bonding_descriptors(fragment_string): current_order = None prev_node = node_count # set default weight - weights[node_count] = 1 + attributes[node_count]['w'] = 1 node_count += 1 # we need to annotate rings to the chiral isomers @@ -244,7 +227,7 @@ def strip_bonding_descriptors(fragment_string): bonded_node = _find_bonded_ring_node(ring_nodes, node) rs_isomers[node][1].append(bonded_node) - return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, weights, hydrogen_weights + return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, attributes, hydrogen_weights def fragment_iter(fragment_str, all_atom=True): """ @@ -274,7 +257,7 @@ def fragment_iter(fragment_str, all_atom=True): delim = fragment.find('=', 0) fragname = fragment[1:delim] frag_smile = fragment[delim+1:] - smile, bonding_descrpt, rs_isomers, ez_isomers, weights, h_weights = strip_bonding_descriptors(frag_smile) + smile, bonding_descrpt, rs_isomers, ez_isomers, attributes, h_weights = strip_bonding_descriptors(frag_smile) if smile == "H": mol_graph = nx.Graph() mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) @@ -305,7 +288,8 @@ def fragment_iter(fragment_str, all_atom=True): nx.set_node_attributes(mol_graph, fragname, 'fragname') nx.set_node_attributes(mol_graph, 0, 'fragid') - nx.set_node_attributes(mol_graph, weights, 'weight') + # set other attributes + nx.set_node_attributes(mol_graph, attributes) yield fragname, mol_graph def read_fragments(fragment_str, all_atom=True, fragment_dict=None): diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index dfadbb6..2e2919f 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -11,7 +11,13 @@ [(0, 1), (1, 2)], [1, 1]), # smiple charges - ("{[#PMA+][#PEO][#PMA-0.25]}", + ("{[#PMA;+1][#PEO][#PMA;-0.25]}", + ["PMA", "PEO", "PMA"], + {0: 1.0, 1: 0.0, 2:-0.25}, + [(0, 1), (1, 2)], + [1, 1]), + # smiple charges with keyword + ("{[#PMA;c=+1][#PEO][#PMA;c=-0.25]}", ["PMA", "PEO", "PMA"], {0: 1.0, 1: 0.0, 2:-0.25}, [(0, 1), (1, 2)], @@ -256,13 +262,15 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): assert meta_mol.edges[edge]["order"] == order fragnames = nx.get_node_attributes(meta_mol, 'fragname') + print(fragnames) + print(nodes) assert nodes == list(fragnames.values()) if charges: - set_charges = nx.get_node_attributes(meta_mol, 'charge') + set_charges = nx.get_node_attributes(meta_mol, 'c') assert set_charges == charges -@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, weights, hweights',( +@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, attrs, hweights',( # smiple symmetric bonding ("[$]COC[$]", "COC", @@ -277,7 +285,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, - {1: 0.5}, + {'w': {0: 1, 1: 0.5, 2: 1}}, None), # smiple symmetric bonding with weight # using cgsmiles string @@ -286,8 +294,17 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, - {1: 0.5}, + {'w': {0: 1, 1: 0.5, 2: 1}}, None), + # # smiple symmetric bonding with random + # # keyword argument + # ("[$][#TC4][#OT1;r=abc][#CD1][$]", + # "[#TC4][#OT1][#CD1]", + # {0: ["$1"], 2: ["$1"]}, + # None, + # None, + # {'w': {0: 1, 1: 1, 2: 1}, 'r': {1: 'abc'}}, + # None), # smiple symmetric bonding with weight # using open smiles and hweights ("[$]CO[C;0.5][$]([H;0.1])[H;0.2]", @@ -295,7 +312,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, - {2: 0.5}, + {'w':{0: 1, 1: 1, 2: 0.5}}, {2: [0.1, 0.2]}), # H atom with weight goes first ("[H;0.3]C[$]O[C;0.5][$]", @@ -303,7 +320,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {1: ["$1"], 3: ["$1"]}, None, None, - {3: 0.5}, + {'w': {1: 1, 2: 1, 3: 0.5}}, {1: [0.3]}), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", @@ -410,8 +427,8 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): None, None), )) -def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights, hweights): - new_smile, new_bonding, rs_isomers, ez_isomers, weights_out, hweights_out = strip_bonding_descriptors(big_smile) +def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs, hweights): + new_smile, new_bonding, rs_isomers, ez_isomers, attrs_out, hweights_out = strip_bonding_descriptors(big_smile) assert new_smile == smile assert new_bonding == bonding if rs: @@ -421,9 +438,11 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights, h # here we check that the weights are correctly # set for nodes with weights; the default is # checked in another test - if weights: - for node, weight in weights.items(): - assert weights_out[node] == weight + print(attrs_out) + if attrs: + for attr, node_attrs in attrs.items(): + for node, value in node_attrs.items(): + assert attrs_out[node][attr] == value if hweights: for node, weight in hweights.items(): assert hweights_out[node] == weight @@ -431,50 +450,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights, h @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment ("{#PEO=[$]COC[$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}), )}, {"PEO": [(0, 1), (1, 2)]}), # single fragment but with explicit hydrogen in smiles ("{#PEO=[$][CH2]O[CH2][$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}), )}, {"PEO": [(0, 1), (1, 2)]}), # test NH3 terminal ("{#AMM=N[$]}", - {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "weight": 1}), + {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "w": 1}), )}, {"AMM": []}), # single fragment + 1 terminal (i.e. only 1 bonding descrpt ("{#PEO=[$]COC[$],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "weight": 1}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "w": 1}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # this adjust the hydrogen count ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "weight": 1}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "w": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "w": 1}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # but explicit hydrogen in the smiles string ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "weight": 1}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "w": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "w": 1}), )}, {"PEO": [(0, 1), (1, 2),], "OHter": []}), diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index 6dabcd7..65656a3 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -248,7 +248,9 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), (18, 21), (18, 22), (19, 23)], - {},{}, {0: 0.5, 4: 0.5, 8: 0.5, 12: 0.5}), + {},{}, {0: 0.5, 1: 1, 2: 1, 3: 1, 4: 0.5, 5: 1, 6: 1, 7: 1, 8: 0.5, + 9: 1, 10: 1, 11: 1, 12: 0.5, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, + 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1}), # test 2 weights ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5][C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}", [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), @@ -312,7 +314,7 @@ def _ele_match(n1, n2): if weights: mol_weights = {node: 1 for node in ref_graph} mol_weights.update(weights) - weights_assigned = nx.get_node_attributes(molecule, 'weight') + weights_assigned = nx.get_node_attributes(molecule, 'w') assert mol_weights == weights_assigned @pytest.mark.parametrize('case, cgsmiles_str, ref_string',( @@ -342,13 +344,14 @@ def _atomname_match(n1, n2): return n1["fragname"] == n2["atomname"] assert nx.is_isomorphic(ref_graph, molecule, node_match=_atomname_match) -@pytest.mark.parametrize('cgsmiles_str, error_message',( -(("{[#A][#B]}.{#A=CC[$]}", "Found node #B but no corresponding fragment."), - ("{[#A][#B]1}.{#A=CC[$],#B=OC[$]}", "You have a dangling ring index."), - ("{[#A]1[#B]1}{#A=CC[$],#B=OC[$]}", "You define two edges between the same node. Use bond order symbols instead."), +@pytest.mark.parametrize('cgsmiles_str, error_message, error_type',( +(("{[#A][#B]}.{#A=CC[$]}", "Found node #B but no corresponding fragment.", SyntaxError), + ("{[#A][#B]1}.{#A=CC[$],#B=OC[$]}", "You have a dangling ring index.", SyntaxError), + ("{[#A]1[#B]1}{#A=CC[$],#B=OC[$]}", "You define two edges between the same node. Use bond order symbols instead.", SyntaxError), + ("{[#A;w=abc][#B]}.{#A=CC[$],#B=OC[$]}", "Argument 'w' must be of type float.", TypeError), ))) -def test_syntax_errors(cgsmiles_str, error_message): - with pytest.raises(SyntaxError) as e_message: +def test_syntax_errors(cgsmiles_str, error_message, error_type): + with pytest.raises(error_type) as e_message: resolver = MoleculeResolver.from_string(cgsmiles_str) cg_mol, aa_mol = resolver.resolve() assert e_message == error_message diff --git a/cgsmiles/tests/test_sampler.py b/cgsmiles/tests/test_sampler.py index 158178f..8ccfd01 100644 --- a/cgsmiles/tests/test_sampler.py +++ b/cgsmiles/tests/test_sampler.py @@ -104,7 +104,7 @@ def test_add_fragment(graph_str, ref_graph = read_cgsmiles(ref_mol) nx.set_node_attributes(ref_graph, bonding, 'bonding') nx.set_node_attributes(ref_graph, fragid, 'fragid') - nx.set_node_attributes(ref_graph, 1, 'weight') + nx.set_node_attributes(ref_graph, 1, 'w') atomnames = nx.get_node_attributes(ref_graph, 'fragname') nx.set_node_attributes(ref_graph, atomnames, 'atomname') nx.set_node_attributes(ref_graph, resnames, 'fragname') From df6c8d89fecbfc199cf2cbde5b5170a1cf46b5d0 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 8 Nov 2024 11:22:57 +0100 Subject: [PATCH 09/16] implement dialects and annotations --- cgsmiles/dialects.py | 78 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 cgsmiles/dialects.py diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py new file mode 100644 index 0000000..31ddb14 --- /dev/null +++ b/cgsmiles/dialects.py @@ -0,0 +1,78 @@ +from inspect import signature, Signature, Parameter +from functools import partial + +def check_and_cast_types(bound_args, signature): + for name, value in bound_args.arguments.items(): + param = signature.parameters.get(name) + # Check if a type annotation is present + if param and param.annotation != Parameter.empty: + expected_type = param.annotation + + # Attempt type casting if the value is not of the expected type + if not isinstance(value, expected_type): + try: + bound_args.arguments[name] = expected_type(value) + except (TypeError, ValueError): + raise TypeError(f"Argument '{name}' must be of type {expected_type.__name__}") + return bound_args + +def _parse_node(string_iteratable, dialect_signature): + """ + This base function parsers a CGSmiles node. It must be + decorated with a signature which defines the dialect. + The dialect sets expected labels and default values of + a given node. + """ + args_found = [] + kwargs_found = {} + if len(string_iteratable) > 0: + elements = string_iteratable.split(';') + for entry in elements: + key_value = entry.split('=') + if len(key_value) == 1: + args_found.append(key_value[0]) + else: + kwargs_found[key_value[0]] = key_value[1] + + applied_labels = dialect_signature.bind(*args_found, + **kwargs_found) + applied_labels = check_and_cast_types(applied_labels, + dialect_signature) + applied_labels.apply_defaults() + return applied_labels.arguments + +def create_dialect(default_attributes): + """ + Creates a signature of default attributes. + Note that the order of the entries in the dict + determines the order of the args accapted. + """ + parameters = [] + for argname, default_value in default_attributes.items(): + arg_type = type(default_value) + parameters.append(Parameter(argname, + Parameter.POSITIONAL_OR_KEYWORD, + default=default_value, + annotation=arg_type)) + sig = Signature(parameters) + return sig + +########################################################## +# KNOWN DIALECTS # +########################################################## +# this one is for global use +# it is the base CGSmiles dialect +GRAPH_BASE = create_dialect({"fragname": "NaN", + "c": 0.0, + "w": 1.0}) +parse_graph_base_node = partial(_parse_node, dialect_signature=GRAPH_BASE) +# this one is an internal fukery until the pysmiles +# base parser is available +# it just strips the kwargs from fragments before +# they go to the respective parser +# in case of cgsmiles fragments it is a bit doing +# double the work +fragment_base = create_dialect({"w": 1.0}) +print(GRAPH_BASE) +print(fragment_base) +_fragment_node_parser = partial(_parse_node, dialect_signature=fragment_base) From 7f43dcf63c163bb719a5f56a86019a8227721021 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 14 Nov 2024 17:38:26 +0100 Subject: [PATCH 10/16] address some comments --- cgsmiles/dialects.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py index 31ddb14..c5b3820 100644 --- a/cgsmiles/dialects.py +++ b/cgsmiles/dialects.py @@ -16,7 +16,10 @@ def check_and_cast_types(bound_args, signature): raise TypeError(f"Argument '{name}' must be of type {expected_type.__name__}") return bound_args -def _parse_node(string_iteratable, dialect_signature): +def _parse_node(string_iterable, + dialect_signature, + annotation_sep_token=';', + annotation_assign_token='='): """ This base function parsers a CGSmiles node. It must be decorated with a signature which defines the dialect. @@ -25,10 +28,10 @@ def _parse_node(string_iteratable, dialect_signature): """ args_found = [] kwargs_found = {} - if len(string_iteratable) > 0: - elements = string_iteratable.split(';') + if len(string_iterable) > 0: + elements = string_iterable.split(annotation_sep_token) for entry in elements: - key_value = entry.split('=') + key_value = entry.split(annotation_assign_token) if len(key_value) == 1: args_found.append(key_value[0]) else: @@ -43,9 +46,9 @@ def _parse_node(string_iteratable, dialect_signature): def create_dialect(default_attributes): """ - Creates a signature of default attributes. + Creates a signature of default annotations. Note that the order of the entries in the dict - determines the order of the args accapted. + determines the order of the args accepted. """ parameters = [] for argname, default_value in default_attributes.items(): @@ -73,6 +76,4 @@ def create_dialect(default_attributes): # in case of cgsmiles fragments it is a bit doing # double the work fragment_base = create_dialect({"w": 1.0}) -print(GRAPH_BASE) -print(fragment_base) _fragment_node_parser = partial(_parse_node, dialect_signature=fragment_base) From 8196158b8201948824b855b1c7d7b2118b3a9055 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 17 Nov 2024 19:16:53 +0100 Subject: [PATCH 11/16] fix bugs regarding kwargs; raise more verbose errors --- cgsmiles/dialects.py | 95 ++++++++++++++++++++------ cgsmiles/read_fragments.py | 7 +- cgsmiles/tests/test_cgsmile_parsing.py | 8 +++ cgsmiles/tests/test_write_cgsmiles.py | 6 ++ 4 files changed, 93 insertions(+), 23 deletions(-) diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py index c5b3820..1ecb6fd 100644 --- a/cgsmiles/dialects.py +++ b/cgsmiles/dialects.py @@ -7,44 +7,94 @@ def check_and_cast_types(bound_args, signature): # Check if a type annotation is present if param and param.annotation != Parameter.empty: expected_type = param.annotation - - # Attempt type casting if the value is not of the expected type - if not isinstance(value, expected_type): - try: - bound_args.arguments[name] = expected_type(value) - except (TypeError, ValueError): - raise TypeError(f"Argument '{name}' must be of type {expected_type.__name__}") + # Attempt type casting if the value is not of the expected type + if not isinstance(value, expected_type): + try: + bound_args.arguments[name] = expected_type(value) + except (TypeError, ValueError): + raise TypeError(f"Argument '{name}' must be of type {expected_type.__name__}") return bound_args -def _parse_node(string_iterable, - dialect_signature, - annotation_sep_token=';', - annotation_assign_token='='): +def _parse_dialect_string(string_iterable, + dialect_signature, + annotation_sep_token=';', + annotation_assign_token='='): """ - This base function parsers a CGSmiles node. It must be - decorated with a signature which defines the dialect. - The dialect sets expected labels and default values of - a given node. + This base function parsers a string that describes key value pairs + in having a pattern of: + + keyvaluekey ... + + Default values, non-keyword agruments and types are defined using the + dialect signature object. If args are defined the key and assignment + token may be omitted. + + Neither the `annotation_sep_token` nor the `annotation_assign_token` + can be part of key or value. A SyntaxError is raised in this case. + + Parameters + ---------- + string_iterable: iter + the string or iter object that contains the string + dialect_signature: cls.inspec.Signature + a signature defineing args, kwargs, default values + and types + annotation_sep_token: str + character used to seperate key value pairs + annotation_assign_token: str + character used to assign a key from a value + + Returns + ------- + dict + dict of key value paris + + Raises + ------ + SyntaxError + an error is raised if the signature does not match or + too many annotation_assign_token are given """ args_found = [] kwargs_found = {} if len(string_iterable) > 0: elements = string_iterable.split(annotation_sep_token) for entry in elements: + if entry.count('=') > 1: + # this takes care of too many '=' chacaters + msg = (f"Your annotation {entry} contains too many " + f"{annotation_assign_token} charachters. Only" + "chacracter per key value pair is allowed") + raise SyntaxError(msg) key_value = entry.split(annotation_assign_token) + if len(key_value) == 1: args_found.append(key_value[0]) else: kwargs_found[key_value[0]] = key_value[1] - applied_labels = dialect_signature.bind(*args_found, - **kwargs_found) + try: + applied_labels = dialect_signature.bind(*args_found, + **kwargs_found) + except TypeError as emsg: + print(emsg) + msg = ("You have too many positional arguments or " + f"{annotation_sep_token} as part of key value " + "pairs which is not allowed.") + raise SyntaxError(msg) + applied_labels = check_and_cast_types(applied_labels, dialect_signature) applied_labels.apply_defaults() - return applied_labels.arguments + # if there are kwargs we need to put them into + # output dict + out_args = {} + out_args.update(applied_labels.arguments['kwargs']) + del applied_labels.arguments['kwargs'] + out_args.update(applied_labels.arguments) + return out_args -def create_dialect(default_attributes): +def create_dialect(default_attributes, accept_kwargs=True): """ Creates a signature of default annotations. Note that the order of the entries in the dict @@ -57,6 +107,9 @@ def create_dialect(default_attributes): Parameter.POSITIONAL_OR_KEYWORD, default=default_value, annotation=arg_type)) + if accept_kwargs: + parameters.append(Parameter('kwargs', + kind=Parameter.VAR_KEYWORD)) sig = Signature(parameters) return sig @@ -68,7 +121,7 @@ def create_dialect(default_attributes): GRAPH_BASE = create_dialect({"fragname": "NaN", "c": 0.0, "w": 1.0}) -parse_graph_base_node = partial(_parse_node, dialect_signature=GRAPH_BASE) +parse_graph_base_node = partial(_parse_dialect_string, dialect_signature=GRAPH_BASE) # this one is an internal fukery until the pysmiles # base parser is available # it just strips the kwargs from fragments before @@ -76,4 +129,4 @@ def create_dialect(default_attributes): # in case of cgsmiles fragments it is a bit doing # double the work fragment_base = create_dialect({"w": 1.0}) -_fragment_node_parser = partial(_parse_node, dialect_signature=fragment_base) +_fragment_node_parser = partial(_parse_dialect_string, dialect_signature=fragment_base) diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index d713477..779e7a7 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -1,12 +1,16 @@ """ Functions for reading the fragment list. """ +import logging from collections import defaultdict import networkx as nx import pysmiles from .read_cgsmiles import read_cgsmiles from .dialects import _fragment_node_parser +logger = logging.getLogger('pysmiles') +logger.setLevel(level=logging.ERROR) + class PeekIter(object): """ Custom iter that allows looking ahead, without @@ -160,13 +164,12 @@ def strip_bonding_descriptors(fragment_string): chiral_token = '@' + next(smile_iter) rs_isomers[node_count] = (chiral_token, []) # we have weights - elif peek == ';': + elif peek == ';' and not record_attributes: record_attributes = True elif record_attributes: attribute_str += peek else: atom += peek - peek = next(smile_iter) record_attributes=False diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index 2e2919f..d2391de 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -287,6 +287,14 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): None, {'w': {0: 1, 1: 0.5, 2: 1}}, None), + # smiple kwarg not part of the defaults + ("[$]C[O;q=4;p=s][C;q=3;p=l][$]", + "C[O][C]", + {0: ["$1"], 2: ["$1"]}, + None, + None, + {'q': {1: '4', 2: '3'}, 'p': {1: 's', 2: 'l'}}, + None), # smiple symmetric bonding with weight # using cgsmiles string ("[$][#TC4][#OT1;0.5][#CD1][$]", diff --git a/cgsmiles/tests/test_write_cgsmiles.py b/cgsmiles/tests/test_write_cgsmiles.py index 8bfae93..836c3bf 100644 --- a/cgsmiles/tests/test_write_cgsmiles.py +++ b/cgsmiles/tests/test_write_cgsmiles.py @@ -22,8 +22,13 @@ )) def test_write_fragments(input_string): frag_dict = read_fragments(input_string) + for g in frag_dict.values(): + print(g.nodes(data=True)) out_string = write_cgsmiles_fragments(frag_dict, smiles_format=True) frag_dict_out = read_fragments(out_string) + for g in frag_dict_out.values(): + print(g.nodes(data=True)) + print(out_string) assert set(frag_dict_out) == set(frag_dict) for fragname in frag_dict: assertEqualGraphs(frag_dict_out[fragname], frag_dict[fragname]) @@ -58,6 +63,7 @@ def test_write_cgsmiles(input_string): fragment_dicts = resolver.fragment_dicts molecule = resolver.molecule output_string = write_cgsmiles(molecule, fragment_dicts) + print(output_string) out_resolver = MoleculeResolver.from_string(output_string) out_mol = out_resolver.molecule assertEqualGraphs(molecule, out_mol) From 351c15c0183f7cb177548af94f903eb63dd00196 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 17 Nov 2024 19:21:20 +0100 Subject: [PATCH 12/16] test more verbose errors --- cgsmiles/tests/test_molecule_resolve.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index f4569ba..4e62d6f 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -355,6 +355,8 @@ def _atomname_match(n1, n2): ("{[#A][#B]1}.{#A=CC[$],#B=OC[$]}", "You have a dangling ring index.", SyntaxError), ("{[#A]1[#B]1}{#A=CC[$],#B=OC[$]}", "You define two edges between the same node. Use bond order symbols instead.", SyntaxError), ("{[#A;w=abc][#B]}.{#A=CC[$],#B=OC[$]}", "Argument 'w' must be of type float.", TypeError), + ("{[#A;w=ab=c][#B]}.{#A=CC[$],#B=OC[$]}", "Your annotation w=ab=c contains too many = charachters. Only one chacracter per key value pair is allowed", SyntaxError), + ("{[#A;w=1,c=1,q=a;d][#B]}.{#A=CC[$],#B=OC[$]}", "You have too many positional arguments or ; as part of key value pairs which is not allowed.", SyntaxError), ))) def test_syntax_errors(cgsmiles_str, error_message, error_type): with pytest.raises(error_type) as e_message: From 73fc426c7e776b608387e4f079eaee819f069a6a Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 17 Nov 2024 19:28:28 +0100 Subject: [PATCH 13/16] rename base dialect to default dialect --- cgsmiles/dialects.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py index 1ecb6fd..91c8c71 100644 --- a/cgsmiles/dialects.py +++ b/cgsmiles/dialects.py @@ -118,10 +118,10 @@ def create_dialect(default_attributes, accept_kwargs=True): ########################################################## # this one is for global use # it is the base CGSmiles dialect -GRAPH_BASE = create_dialect({"fragname": "NaN", - "c": 0.0, - "w": 1.0}) -parse_graph_base_node = partial(_parse_dialect_string, dialect_signature=GRAPH_BASE) +CGSMILES_DEFAULT_DIALECT = create_dialect({"fragname": "NaN", + "c": 0.0, + "w": 1.0}) +parse_graph_base_node = partial(_parse_dialect_string, dialect_signature=CGSMILES_DEFAULT_DIALECT) # this one is an internal fukery until the pysmiles # base parser is available # it just strips the kwargs from fragments before From 91f77aa943a2123d1b6c13884a2745d06fc08362 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 18 Nov 2024 12:03:33 +0100 Subject: [PATCH 14/16] enable explicit hatoms --- cgsmiles/cgsmiles_utils.py | 33 ++++++ cgsmiles/dialects.py | 7 +- cgsmiles/graph_utils.py | 1 - cgsmiles/pysmiles_utils.py | 137 +++++++++++++++++++----- cgsmiles/read_fragments.py | 57 +++------- cgsmiles/tests/test_cgsmile_parsing.py | 55 +++------- cgsmiles/tests/test_molecule_resolve.py | 19 ++-- 7 files changed, 189 insertions(+), 120 deletions(-) diff --git a/cgsmiles/cgsmiles_utils.py b/cgsmiles/cgsmiles_utils.py index ef723da..6863fe0 100644 --- a/cgsmiles/cgsmiles_utils.py +++ b/cgsmiles/cgsmiles_utils.py @@ -1,5 +1,6 @@ from collections import defaultdict import networkx as nx +from .read_cgsmiles import read_cgsmiles def find_complementary_bonding_descriptor(bonding_descriptor, ellegible_descriptors=None): """ @@ -64,3 +65,35 @@ def find_open_bonds(molecule, target_nodes=None): for bonding_types in bonding_types: open_bonds_by_descriptor[bonding_types].append(node) return open_bonds_by_descriptor + +def read_fragment_cgsmiles(cgsmiles_str, + fragname, + bonding_descrpt={}, + attributes={}): + """ + Read a smiles_str corresponding to a CGSmiles fragment and + annotate bonding descriptors, isomers, as well as any other + attributes. + + Parameters + ---------- + smiles_str: str + string in CGSmiles format + fragname: str + the name of the fragment + attributes: dict + + Returns + ------- + nx.Graph + the graph of the molecular fragment + """ + mol_graph = read_cgsmiles(cgsmiles_str) + fragnames = nx.get_node_attributes(mol_graph, 'fragname') + nx.set_node_attributes(mol_graph, fragnames, 'atomname') + nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + nx.set_node_attributes(mol_graph, fragname, 'fragname') + nx.set_node_attributes(mol_graph, 0, 'fragid') + nx.set_node_attributes(mol_graph, 1, 'w') + nx.set_node_attributes(mol_graph, attributes) + return mol_graph diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py index 91c8c71..36b3a4b 100644 --- a/cgsmiles/dialects.py +++ b/cgsmiles/dialects.py @@ -89,8 +89,9 @@ def _parse_dialect_string(string_iterable, # if there are kwargs we need to put them into # output dict out_args = {} - out_args.update(applied_labels.arguments['kwargs']) - del applied_labels.arguments['kwargs'] + if 'kwargs' in applied_labels.arguments: + out_args.update(applied_labels.arguments['kwargs']) + del applied_labels.arguments['kwargs'] out_args.update(applied_labels.arguments) return out_args @@ -128,5 +129,5 @@ def create_dialect(default_attributes, accept_kwargs=True): # they go to the respective parser # in case of cgsmiles fragments it is a bit doing # double the work -fragment_base = create_dialect({"w": 1.0}) +fragment_base = create_dialect({"w": 1.0}, accept_kwargs=True) _fragment_node_parser = partial(_parse_dialect_string, dialect_signature=fragment_base) diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py index 64c384f..6a39ea2 100644 --- a/cgsmiles/graph_utils.py +++ b/cgsmiles/graph_utils.py @@ -146,7 +146,6 @@ def annotate_fragments(meta_graph, molecule): return meta_graph - def set_atom_names_atomistic(molecule, meta_graph=None): """ Set atomnames according to commonly used convention diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 9b66e6d..61ace56 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -32,7 +32,7 @@ def compute_mass(input_molecule): mass += pysmiles.PTE[element]['AtomicMass'] return mass -def rebuild_h_atoms(mol_graph, keep_bonding=False): +def rebuild_h_atoms(mol_graph, copy_attrs=['fragid', 'fragname', 'w']): """ Helper function which add hydrogen atoms to the molecule graph. @@ -48,21 +48,20 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): The molecule graph is updated in place with the hydrogen atoms that are missing. - Using the keep_bonding argument the hydrogen count is reduced - by the number of bonding descriptors. In this way hydrogen - atoms can also be added to fragments only. + The `copy_attrs` argument defines a list of attributes to copy + to the newly added hydrogen atoms. In case the hydrogen atoms + are their own fragments attributes are not copied. If an attribute + is already assigned, because the hydrogen atom was explicit that + attribute is not replaced. Parameters ---------- mol_graph: :class:`nx.Graph` graph describing the full molecule without hydrogen atoms + copy_attrs: list[abc.hashable] + a list of attributes to copy from the parent node to the + hydrogen atom """ - for node in mol_graph.nodes: - - if mol_graph.nodes[node].get('bonding', False) and \ - mol_graph.nodes[node].get('element', '*') == "H": - mol_graph.nodes[node]['single_h_frag'] = True - try: pysmiles.smiles_helper.correct_aromatic_rings(mol_graph, strict=True) except SyntaxError as pysmiles_err: @@ -79,22 +78,14 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): pysmiles.smiles_helper.fill_valence(mol_graph, respect_hcount=False) pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) - for node in mol_graph.nodes: - if mol_graph.nodes[node].get("element", "*") == "H" and\ - not mol_graph.nodes[node].get("single_h_frag", False): - ref_node = next(mol_graph.neighbors(node)) - mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"] - mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"] - if mol_graph.nodes[node].get("element", "*") == "H": - anchor = list(mol_graph.neighbors(node))[0] - # the weight for the hydrogen atom was explicitly set - hweights = mol_graph.nodes[anchor].get('hweight', []) - if hweights: - weight = hweights.pop() - # make sure the weights are copied for implicit h-atoms - else: - weight = mol_graph.nodes[anchor].get("w", 1) - mol_graph.nodes[node]["w"] = weight + for node, element in mol_graph.nodes(data='element'): + if element == "H" and not mol_graph.nodes[node].get("single_h_frag", False): + anchor = next(mol_graph.neighbors(node)) + for attr in copy_attrs: + if attr in mol_graph.nodes[node]: + continue + value = mol_graph.nodes[anchor][attr] + mol_graph.nodes[node][attr] = value def annotate_ez_isomers(molecule): """ @@ -177,3 +168,97 @@ def mark_chiral_atoms(molecule): neighbours = [neighbours[0], neighbours[1], neighbours[3], neighbours[2]] molecule.nodes[node]['rs_isomer'] = tuple(neighbours) + +def read_fragment_smiles(smiles_str, + fragname, + bonding_descrpt={}, + rs_isomers={}, + ez_isomers={}, + attributes={}): + """ + Read a smiles_str corresponding to a CGSmiles fragment and + annotate bonding descriptors, isomers, as well as any other + attributes. + + This function also sets default attributes as follows: + + - fragname to `fragname` + - fragid to 0 + - w to 1 + + Parameters + ---------- + smiles_str: str + string in OpenSMILES format + fragname: str + the name of the fragment + rs_isomers: dict + ez_isomers: dict + attributes: dict + + Returns + ------- + nx.Graph + the graph of the molecular fragment + """ + if smiles_str == 'H': + LOGGER.warning("You define an H fragment, which is not valid SMILES. We'll make it [H].") + smiles_str = '[H]' + + mol_graph = pysmiles.read_smiles(smiles_str, + explicit_hydrogen=True, + reinterpret_aromatic=False, + strict=False) + # set some default values + nx.set_node_attributes(mol_graph, fragname, 'fragname') + nx.set_node_attributes(mol_graph, 0, 'fragid') + nx.set_node_attributes(mol_graph, 1, 'w') + + # we add all bonding descriptors to the molecule + nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + + # set other attributes + nx.set_node_attributes(mol_graph, attributes) + + # set the default atomnames consiting of the element and index + atomnames = {node[0]: node[1]['element']+str(node[0]) for node in mol_graph.nodes(data=True)} + nx.set_node_attributes(mol_graph, atomnames, 'atomname') + + # we have just a single atom so no need for any annotations + if len(mol_graph) == 1: + # we set the hcount for all non-hydrogen elements + if mol_graph.nodes[0]['element'] != 'H': + mol_graph.nodes[0]['hcount'] = 0 + # we tag all single h-atoms + else: + mol_graph.nodes[0]['single_h_frag'] = True + return mol_graph + + # we need to remove hydrogen atoms except when they are having + # attributes; in this case we need to keep them + hatoms = set([n for n, e in mol_graph.nodes(data='element') if e == 'H']) + hatoms_to_keep = set(attributes.keys()) & hatoms + + # temp fix until pysmiles util is imporved + # we set the element to z so they are ignored when pysmiles removes hatoms + nx.set_node_attributes(mol_graph, + dict(zip(hatoms_to_keep, len(hatoms_to_keep)*'z')), + 'element') + + pysmiles.remove_explicit_hydrogens(mol_graph) + + # now we reset the hatoms + nx.set_node_attributes(mol_graph, + dict(zip(hatoms_to_keep, len(hatoms_to_keep)*'H')), + 'element') + + # annotate rs isomers + nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer') + + # we need to split countable node keys and the associated value + ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()} + ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()} + nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms') + nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class') + + return mol_graph diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 779e7a7..16e782a 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -7,6 +7,8 @@ import pysmiles from .read_cgsmiles import read_cgsmiles from .dialects import _fragment_node_parser +from .pysmiles_utils import read_fragment_smiles +from .cgsmiles_utils import read_fragment_cgsmiles logger = logging.getLogger('pysmiles') logger.setLevel(level=logging.ERROR) @@ -129,7 +131,6 @@ def strip_bonding_descriptors(fragment_string): rs_isomers = {} attributes = defaultdict(dict) record_attributes = False - hydrogen_weights = defaultdict(list) smile = "" node_count = 0 prev_node = 0 @@ -176,12 +177,6 @@ def strip_bonding_descriptors(fragment_string): # here we do some post processing cleanup node_attributes = _fragment_node_parser(attribute_str) attributes[node_count].update(node_attributes) - # hydrogen atoms are implicit so we filter - # them out here - if atom[1:] == 'H' and node_count == 0: - hydrogen_weights[1].append(attributes[node_count]['w']) - elif atom[1:] == 'H': - hydrogen_weights[prev_node].append(attributes[node_count]['w']) smile = smile + atom + "]" prev_node = node_count @@ -219,8 +214,6 @@ def strip_bonding_descriptors(fragment_string): smile += token current_order = None prev_node = node_count - # set default weight - attributes[node_count]['w'] = 1 node_count += 1 # we need to annotate rings to the chiral isomers @@ -230,7 +223,7 @@ def strip_bonding_descriptors(fragment_string): bonded_node = _find_bonded_ring_node(ring_nodes, node) rs_isomers[node][1].append(bonded_node) - return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, attributes, hydrogen_weights + return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, attributes def fragment_iter(fragment_str, all_atom=True): """ @@ -260,39 +253,21 @@ def fragment_iter(fragment_str, all_atom=True): delim = fragment.find('=', 0) fragname = fragment[1:delim] frag_smile = fragment[delim+1:] - smile, bonding_descrpt, rs_isomers, ez_isomers, attributes, h_weights = strip_bonding_descriptors(frag_smile) - if smile == "H": - mol_graph = nx.Graph() - mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) - nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') - elif all_atom: - mol_graph = pysmiles.read_smiles(smile, - reinterpret_aromatic=False, - strict=False) - nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') - nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer') - # we need to split countable node keys and the associated value - ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()} - ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()} - nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms') - nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class') - # set the hydrogen weight attribute - nx.set_node_attributes(mol_graph, h_weights, 'hweight') + smiles_str, bonding_descrpt, rs_isomers, ez_isomers, attributes = strip_bonding_descriptors(frag_smile) + # read an all_atom fragment using OpenSMILES definition + if all_atom: + mol_graph = read_fragment_smiles(smiles_str, + fragname, + bonding_descrpt, + rs_isomers, + ez_isomers, + attributes) # we deal with a CG resolution graph else: - mol_graph = read_cgsmiles(smile) - fragnames = nx.get_node_attributes(mol_graph, 'fragname') - nx.set_node_attributes(mol_graph, fragnames, 'atomname') - nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') - - if all_atom: - atomnames = {node[0]: node[1]['element']+str(node[0]) for node in mol_graph.nodes(data=True)} - nx.set_node_attributes(mol_graph, atomnames, 'atomname') - - nx.set_node_attributes(mol_graph, fragname, 'fragname') - nx.set_node_attributes(mol_graph, 0, 'fragid') - # set other attributes - nx.set_node_attributes(mol_graph, attributes) + mol_graph = read_fragment_cgsmiles(smiles_str, + fragname, + bonding_descrpt, + attributes) yield fragname, mol_graph def read_fragments(fragment_str, all_atom=True, fragment_dict=None): diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index d2391de..c62f3f7 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -270,14 +270,13 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): set_charges = nx.get_node_attributes(meta_mol, 'c') assert set_charges == charges -@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, attrs, hweights',( +@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, attrs',( # smiple symmetric bonding ("[$]COC[$]", "COC", {0: ["$1"], 2: ["$1"]}, None, None, - None, None), # smiple symmetric bonding with weight ("[$]C[O;0.5]C[$]", @@ -285,16 +284,14 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, - {'w': {0: 1, 1: 0.5, 2: 1}}, - None), + {'w': {1: 0.5}}), # smiple kwarg not part of the defaults ("[$]C[O;q=4;p=s][C;q=3;p=l][$]", "C[O][C]", {0: ["$1"], 2: ["$1"]}, None, None, - {'q': {1: '4', 2: '3'}, 'p': {1: 's', 2: 'l'}}, - None), + {'q': {1: '4', 2: '3'}, 'p': {1: 's', 2: 'l'}}), # smiple symmetric bonding with weight # using cgsmiles string ("[$][#TC4][#OT1;0.5][#CD1][$]", @@ -302,17 +299,15 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, - {'w': {0: 1, 1: 0.5, 2: 1}}, - None), - # # smiple symmetric bonding with random - # # keyword argument - # ("[$][#TC4][#OT1;r=abc][#CD1][$]", - # "[#TC4][#OT1][#CD1]", - # {0: ["$1"], 2: ["$1"]}, - # None, - # None, - # {'w': {0: 1, 1: 1, 2: 1}, 'r': {1: 'abc'}}, - # None), + {'w': {1: 0.5}}), + # smiple symmetric bonding with random + # keyword argument + ("[$][#TC4][#OT1;r=abc][#CD1][$]", + "[#TC4][#OT1][#CD1]", + {0: ["$1"], 2: ["$1"]}, + None, + None, + {'r': {1: 'abc'}}), # smiple symmetric bonding with weight # using open smiles and hweights ("[$]CO[C;0.5][$]([H;0.1])[H;0.2]", @@ -320,23 +315,20 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, - {'w':{0: 1, 1: 1, 2: 0.5}}, - {2: [0.1, 0.2]}), + {'w': {2: 0.5, 3: 0.1, 4: 0.2}}), # H atom with weight goes first ("[H;0.3]C[$]O[C;0.5][$]", "[H]CO[C]", {1: ["$1"], 3: ["$1"]}, None, None, - {'w': {1: 1, 2: 1, 3: 0.5}}, - {1: [0.3]}), + {'w': {0: 0.3, 3: 0.5}}), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", {0: ["$1A1"], 2: ["$1A1"]}, None, None, - None, None), # smiple bonding multiletter atom ("Clc[$]c[$]", @@ -344,7 +336,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {1: ["$1"], 2: ["$1"]}, None, None, - None, None), # simple symmetric but with explicit hydrogen ("[$][CH2]O[CH2][$]", @@ -352,7 +343,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, - None, None), # smiple symmetric bonding; multiple descript ("[$]COC[$][$1]", @@ -360,7 +350,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1", "$11"]}, None, None, - None, None), # named different bonding descriptors ("[$1]CCCC[$2]", @@ -368,7 +357,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$11"], 3: ["$21"]}, None, None, - None, None), # ring and bonding descriptors ("[$1]CC[$2]C1CCCCC1", @@ -376,7 +364,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$11"], 1: ["$21"]}, None, None, - None, None), # bonding descript. after branch ("C(COC[$1])[$2]CCC[$3]", @@ -384,7 +371,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$21"], 3: ["$11"], 6: ["$31"]}, None, None, - None, None), # left rigth bonding desciptors ("[>]COC[<]", @@ -392,7 +378,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 2: ["<1"]}, None, None, - None, None), # simple chirality in residue ("[>]C[C@](F)(B)N[<]", @@ -400,7 +385,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 4: ["<1"]}, {1: ('@', [])}, None, - None, None), # simple chirality inverse in residue ("[>]C[C@@](F)(B)N[<]", @@ -408,7 +392,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 4: ["<1"]}, {1: ('@@', [])}, None, - None, None), # \ fragment split ("[>]CC(\F)=[<]", @@ -416,7 +399,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 1: ["<2"]}, None, {2: (2, 1, '\\')}, - None, None), # / fragment split ("[>]CC(/F)=[<]", @@ -424,7 +406,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 1: ["<2"]}, None, {2: (2, 1, '/')}, - None, None), # both in one fragment ("[>]CC(/F)=C(\F)C[<]", @@ -432,11 +413,10 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: [">1"], 5: ["<1"]}, None, {2: (2, 1, '/'), 4: (4, 3, '\\')}, - None, None), )) -def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs, hweights): - new_smile, new_bonding, rs_isomers, ez_isomers, attrs_out, hweights_out = strip_bonding_descriptors(big_smile) +def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs): + new_smile, new_bonding, rs_isomers, ez_isomers, attrs_out = strip_bonding_descriptors(big_smile) assert new_smile == smile assert new_bonding == bonding if rs: @@ -451,9 +431,6 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs, hwe for attr, node_attrs in attrs.items(): for node, value in node_attrs.items(): assert attrs_out[node][attr] == value - if hweights: - for node, weight in hweights.items(): - assert hweights_out[node] == weight @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index 4e62d6f..db7b1cc 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -42,7 +42,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): @pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez, weights',( # smiple linear seqeunce - ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}", + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$]O}", # 0 1 2 3 4 5 6 7 8 [('OHter', 'O H'), ('PEO', 'C O C H H H H'), # 9 10 11 12 13 14 15 16 17 @@ -63,7 +63,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13), (10, 14), (11, 15)], {}, {}, {}), # smiple linear seqeunce unconsumed bonding descrpt - ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}", + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$]O}", # 0 1 2 3 4 5 6 7 8 [('OHter', 'O H'), ('PEO', 'C O C H H H H'), # 9 10 11 12 13 14 15 16 17 @@ -115,7 +115,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): # something with a ring # 012 34567 # 890123456 - ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$]H}", + ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$][H]}", [('Hter', 'H'), ('PS', 'C C C C C C C C H H H H H H H H'), ('PS', 'C C C C C C C C H H H H H H H H'), ('Hter', 'H')], 'H C C C C C C C C H H H H H H H H C C C C C C C C H H H H H H H H H', @@ -273,14 +273,13 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.5, 8: 0.5, 9: 0.1, 12: 0.5, 13: 0.1}), # test 2 weights and hydrogen weights ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[O;0.5]([H;0.2])[C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}", - [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), + [('SP4', 'O H C C O H H H'), ('SP4', 'O H C C O H H H'), ('SP1r', 'O C C O H H H H')], - 'O C C O H H H H O C C O H H H H O C C O H H H H', - [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6), - (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), - (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), - (18, 21), (18, 22), (19, 23)], - {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.2, 8: 0.5, 9: 0.1, 12: 0.2, 13: 0.1}), + 'O H C C O H H H O H C C O H H H O C C O H H H H', + [(0, 1), (0, 2), (2, 3), (2, 10), (2, 5), (3, 4), (3, 16), (3, 6), (4, 7), (8, 9), + (8, 10), (10, 11), (10, 13), (11, 12), (11, 17), (11, 14), (12, 15), (16, 17), + (17, 18), (17, 20), (18, 19), (18, 21), (18, 22), (19, 23)], + {},{}, {0: 0.5, 1: 0.2, 2: 0.1, 5: 0.1, 8: 0.5, 9: 0.2, 10: 0.1, 13: 0.1}), )) def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez, weights): meta_mol, molecule = MoleculeResolver.from_string(smile).resolve() From 4e508f9f4ddebf60f88e000b03a1f42fd829805b Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 18 Nov 2024 12:19:16 +0100 Subject: [PATCH 15/16] change c to q and use fullnames for weight and charge --- cgsmiles/dialects.py | 25 +++++++++++-- cgsmiles/pysmiles_utils.py | 4 +- cgsmiles/tests/test_cgsmile_parsing.py | 50 ++++++++++++------------- cgsmiles/tests/test_molecule_resolve.py | 2 +- 4 files changed, 50 insertions(+), 31 deletions(-) diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py index 36b3a4b..81b3687 100644 --- a/cgsmiles/dialects.py +++ b/cgsmiles/dialects.py @@ -17,6 +17,7 @@ def check_and_cast_types(bound_args, signature): def _parse_dialect_string(string_iterable, dialect_signature, + arg_to_fullname={}, annotation_sep_token=';', annotation_assign_token='='): """ @@ -39,6 +40,8 @@ def _parse_dialect_string(string_iterable, dialect_signature: cls.inspec.Signature a signature defineing args, kwargs, default values and types + arg_to_fullname: dict + maps arguments to more verbose descriptions annotation_sep_token: str character used to seperate key value pairs annotation_assign_token: str @@ -86,6 +89,18 @@ def _parse_dialect_string(string_iterable, applied_labels = check_and_cast_types(applied_labels, dialect_signature) applied_labels.apply_defaults() + # convert keys to more verbose names + # this should only apply to args know to + # the signature + remove_keys = [] + for old_key, new_key in arg_to_fullname.items(): + if old_key in applied_labels.arguments: + applied_labels.arguments[new_key] = applied_labels.arguments[old_key] + remove_keys.append(old_key) + + for key in remove_keys: + del applied_labels.arguments[key] + # if there are kwargs we need to put them into # output dict out_args = {} @@ -120,9 +135,11 @@ def create_dialect(default_attributes, accept_kwargs=True): # this one is for global use # it is the base CGSmiles dialect CGSMILES_DEFAULT_DIALECT = create_dialect({"fragname": "NaN", - "c": 0.0, + "q": 0.0, "w": 1.0}) -parse_graph_base_node = partial(_parse_dialect_string, dialect_signature=CGSMILES_DEFAULT_DIALECT) +parse_graph_base_node = partial(_parse_dialect_string, + dialect_signature=CGSMILES_DEFAULT_DIALECT, + arg_to_fullname = {"w": "weight", "q": "charge"}) # this one is an internal fukery until the pysmiles # base parser is available # it just strips the kwargs from fragments before @@ -130,4 +147,6 @@ def create_dialect(default_attributes, accept_kwargs=True): # in case of cgsmiles fragments it is a bit doing # double the work fragment_base = create_dialect({"w": 1.0}, accept_kwargs=True) -_fragment_node_parser = partial(_parse_dialect_string, dialect_signature=fragment_base) +_fragment_node_parser = partial(_parse_dialect_string, + dialect_signature=fragment_base, + arg_to_fullname = {"w": "weight"}) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 61ace56..69f76ab 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -32,7 +32,7 @@ def compute_mass(input_molecule): mass += pysmiles.PTE[element]['AtomicMass'] return mass -def rebuild_h_atoms(mol_graph, copy_attrs=['fragid', 'fragname', 'w']): +def rebuild_h_atoms(mol_graph, copy_attrs=['fragid', 'fragname', 'weight']): """ Helper function which add hydrogen atoms to the molecule graph. @@ -212,7 +212,7 @@ def read_fragment_smiles(smiles_str, # set some default values nx.set_node_attributes(mol_graph, fragname, 'fragname') nx.set_node_attributes(mol_graph, 0, 'fragid') - nx.set_node_attributes(mol_graph, 1, 'w') + nx.set_node_attributes(mol_graph, 1, 'weight') # we add all bonding descriptors to the molecule nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py index c62f3f7..ced9802 100644 --- a/cgsmiles/tests/test_cgsmile_parsing.py +++ b/cgsmiles/tests/test_cgsmile_parsing.py @@ -17,7 +17,7 @@ [(0, 1), (1, 2)], [1, 1]), # smiple charges with keyword - ("{[#PMA;c=+1][#PEO][#PMA;c=-0.25]}", + ("{[#PMA;q=+1][#PEO][#PMA;q=-0.25]}", ["PMA", "PEO", "PMA"], {0: 1.0, 1: 0.0, 2:-0.25}, [(0, 1), (1, 2)], @@ -267,7 +267,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): assert nodes == list(fragnames.values()) if charges: - set_charges = nx.get_node_attributes(meta_mol, 'c') + set_charges = nx.get_node_attributes(meta_mol, 'charge') assert set_charges == charges @pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, attrs',( @@ -284,7 +284,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, - {'w': {1: 0.5}}), + {'weight': {1: 0.5}}), # smiple kwarg not part of the defaults ("[$]C[O;q=4;p=s][C;q=3;p=l][$]", "C[O][C]", @@ -299,7 +299,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, - {'w': {1: 0.5}}), + {'weight': {1: 0.5}}), # smiple symmetric bonding with random # keyword argument ("[$][#TC4][#OT1;r=abc][#CD1][$]", @@ -315,14 +315,14 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders): {0: ["$1"], 2: ["$1"]}, None, None, - {'w': {2: 0.5, 3: 0.1, 4: 0.2}}), + {'weight': {2: 0.5, 3: 0.1, 4: 0.2}}), # H atom with weight goes first ("[H;0.3]C[$]O[C;0.5][$]", "[H]CO[C]", {1: ["$1"], 3: ["$1"]}, None, None, - {'w': {0: 0.3, 3: 0.5}}), + {'weight': {0: 0.3, 3: 0.5}}), # smiple symmetric bonding with more than one name ("[$1A]COC[$1A]", "COC", @@ -435,50 +435,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs): @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment ("{#PEO=[$]COC[$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), )}, {"PEO": [(0, 1), (1, 2)]}), # single fragment but with explicit hydrogen in smiles ("{#PEO=[$][CH2]O[CH2][$]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), )}, {"PEO": [(0, 1), (1, 2)]}), # test NH3 terminal ("{#AMM=N[$]}", - {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "w": 1}), + {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "weight": 1}), )}, {"AMM": []}), # single fragment + 1 terminal (i.e. only 1 bonding descrpt ("{#PEO=[$]COC[$],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "w": 1}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "weight": 1}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # this adjust the hydrogen count ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "w": 1}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "weight": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "w": 1}),)}, + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}),)}, {"PEO": [(0, 1), (1, 2)], "OHter": []}), # single fragment + 1 terminal but multiple bond descritp. # but explicit hydrogen in the smiles string ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}), - (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}), - (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "w": 1}), + {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}), + (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}), + (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "weight": 1}), ), - "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "w": 1}), + "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}), )}, {"PEO": [(0, 1), (1, 2),], "OHter": []}), diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index db7b1cc..ebc8299 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -319,7 +319,7 @@ def _ele_match(n1, n2): if weights: mol_weights = {node: 1 for node in ref_graph} mol_weights.update(weights) - weights_assigned = nx.get_node_attributes(molecule, 'w') + weights_assigned = nx.get_node_attributes(molecule, 'weight') assert mol_weights == weights_assigned @pytest.mark.parametrize('case, cgsmiles_str, ref_string',( From 64027bcbdda4dd32bced4ef3a0b07550550beaa7 Mon Sep 17 00:00:00 2001 From: "Dr. Fabian Grunewald" <32294573+fgrunewald@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:13:32 +0100 Subject: [PATCH 16/16] Update cgsmiles/dialects.py Co-authored-by: Peter C Kroon --- cgsmiles/dialects.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py index 81b3687..5404c3b 100644 --- a/cgsmiles/dialects.py +++ b/cgsmiles/dialects.py @@ -92,14 +92,9 @@ def _parse_dialect_string(string_iterable, # convert keys to more verbose names # this should only apply to args know to # the signature - remove_keys = [] for old_key, new_key in arg_to_fullname.items(): if old_key in applied_labels.arguments: - applied_labels.arguments[new_key] = applied_labels.arguments[old_key] - remove_keys.append(old_key) - - for key in remove_keys: - del applied_labels.arguments[key] + applied_labels.arguments[new_key] = applied_labels.arguments.pop(old_key) # if there are kwargs we need to put them into # output dict