diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 83051bd..419efb3 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -83,6 +83,11 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False): ref_node = next(mol_graph.neighbors(node)) mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"] mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"] + if mol_graph.nodes[node].get("element", "*") == "H": + # make sure the weights are copied for implicit h-atoms + anchor = list(mol_graph.neighbors(node))[0] + weight = mol_graph.nodes[anchor].get("weight", 1) + mol_graph.nodes[node]["weight"] = weight def annotate_ez_isomers(molecule): """ diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index f065317..fa28829 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -95,6 +95,31 @@ def collect_ring_number(smile_iter, token, node_count, rings): return smile_iter, token, partial_str, rings +def get_weight(smile_iter): + """ + Extracts weights given to atoms/nodes in + fragments. The iter should be advanced + up to the weight marker ;. + + Parameters + ---------- + smile_iter: class.PeekIter + + Returns + ------- + float: + the weight + PeekIter + the advanced iter object + """ + num = [] + for digit in smile_iter: + num.append(digit) + if smile_iter.peek() in [']', '@', 'H']: + break + out = float("".join(num)) + return out, smile_iter + def strip_bonding_descriptors(fragment_string): """ Processes a CGSmiles fragment string by @@ -122,6 +147,7 @@ def strip_bonding_descriptors(fragment_string): rings = defaultdict(list) ez_isomer_atoms = {} rs_isomers = {} + weights = {} smile = "" node_count = 0 prev_node = 0 @@ -147,6 +173,8 @@ def strip_bonding_descriptors(fragment_string): bonding_descrpt[prev_node].append(bond_descrp + str(order)) else: atom = token + # set the default weight + weights[node_count] = 1 while peek != ']': # deal with rs chirality if peek == '@': @@ -154,6 +182,10 @@ def strip_bonding_descriptors(fragment_string): if smile_iter.peek() == '@': chiral_token = '@' + next(smile_iter) rs_isomers[node_count] = (chiral_token, []) + # we have weights + elif peek == ';': + weight, smile_iter = get_weight(smile_iter) + weights[node_count] = weight else: atom += peek peek = next(smile_iter) @@ -193,6 +225,8 @@ def strip_bonding_descriptors(fragment_string): smile += token current_order = None prev_node = node_count + # set default weight + weights[node_count] = 1 node_count += 1 # we need to annotate rings to the chiral isomers @@ -201,7 +235,8 @@ def strip_bonding_descriptors(fragment_string): if node in ring_nodes: bonded_node = _find_bonded_ring_node(ring_nodes, node) rs_isomers[node][1].append(bonded_node) - return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms + + return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, weights def fragment_iter(fragment_str, all_atom=True): """ @@ -230,8 +265,8 @@ def fragment_iter(fragment_str, all_atom=True): for fragment in fragment_str[1:-1].split(','): delim = fragment.find('=', 0) fragname = fragment[1:delim] - big_smile = fragment[delim+1:] - smile, bonding_descrpt, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile) + frag_smile = fragment[delim+1:] + smile, bonding_descrpt, rs_isomers, ez_isomers, weights = strip_bonding_descriptors(frag_smile) if smile == "H": mol_graph = nx.Graph() mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) @@ -258,6 +293,7 @@ def fragment_iter(fragment_str, all_atom=True): nx.set_node_attributes(mol_graph, fragname, 'fragname') nx.set_node_attributes(mol_graph, 0, 'fragid') + nx.set_node_attributes(mol_graph, weights, 'weight') yield fragname, mol_graph def read_fragments(fragment_str, all_atom=True, fragment_dict=None): diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py index afdbf89..14b1d70 100644 --- a/cgsmiles/tests/test_molecule_resolve.py +++ b/cgsmiles/tests/test_molecule_resolve.py @@ -40,7 +40,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): assert new_btypes == btypes -@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez',( +@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez, weights',( # smiple linear seqeunce ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}", # 0 1 2 3 4 5 6 7 8 @@ -51,7 +51,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)], - {}, {}), + {}, {}, {}), # smiple linear seqeunce with bond-order in link ("{[#TC1][#TC4][#TC1]}.{#TC1=[$1]=CC=[$2],#TC4=[$1]=CC=[$2]}", # 0 1 2 3 4 5 6 7 8 9 @@ -61,7 +61,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'C C H H H H C C H H C C H H H H', [(0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (0, 6), (6, 7), (6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13), - (10, 14), (11, 15)], {}, {}), + (10, 14), (11, 15)], {}, {}, {}), # smiple linear seqeunce unconsumed bonding descrpt ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}", # 0 1 2 3 4 5 6 7 8 @@ -71,7 +71,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}), + (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}, {}), # smiple linear seqeunce with ionic bond ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O-].[Na+]}", # 0 1 2 3 4 5 6 7 8 @@ -81,7 +81,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O Na C O C H H H H C O C H H H H O Na', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}), + (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}, {}), # smiple linear seqeunce with ionic ending ("{[#OH][#PEO]|2[#ON]}.{#PEO=[$]COC[$],#OH=[$]O,#ON=[$][O-]}", # 0 1 2 3 4 5 6 7 8 @@ -91,7 +91,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O', [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), - (11, 14), (11, 15), (11, 16)], {}, {}), + (11, 14), (11, 15), (11, 16)], {}, {}, {}), # uncomsumed bonding IDs; note that this is not the same # molecule as previous test case. Here one of the OH branches # and replaces an CH2 group with CH-OH @@ -103,7 +103,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O H C O C H H H H C O C H H H H O H', [(0, 1), (0, 2), (2, 3), (2, 5), (2, 11), (3, 4), (4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13), - (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}), + (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}, {}), # simple branched sequence ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), @@ -111,7 +111,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'H C C H H H C O C H H H H H C C H H H C O C H H H H H H', [(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8), (8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27), - (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}), + (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}, {}), # something with a ring # 012 34567 # 890123456 @@ -124,7 +124,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25), (17, 26), (18, 19), (18, 27), (18, 33), (19, 20), (19, 24), (20, 21), (20, 28), (21, 22), (21, 29), (22, 23), (22, 30), - (23, 24), (23, 31), (24, 32)], {}, {}), + (23, 24), (23, 31), (24, 32)], {}, {}, {}), # something more complicated branched # here we have multiple bonding descriptors # # despite being the same residue we have 3 fragments after adding hydrgens @@ -146,7 +146,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H H'), ('B', 'C H H C H H H'),], 'O H C H H C H H H', [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), - (5, 6), (5, 7), (5, 8)], {}, {}), + (5, 6), (5, 7), (5, 8)], {}, {}, {}), # smiple squash operator; unconsumed operators ("{[#A][#B]}.{#A=OC[!],#B=[$][!]CC}", # 0 1 2 3 4 1 5 3 4 6 7 8 @@ -157,7 +157,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H H'), ('B', 'C H H C H H H'),], 'O H C H H C H H H', [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), - (5, 6), (5, 7), (5, 8)], {}, {}), + (5, 6), (5, 7), (5, 8)], {}, {}, {}), # smiple squash operator; plus connect operator ("{[#A][#B][#C]}.{#A=OC[!],#B=[$][!]CC,#C=[$]O}", # 0 1 2 3 4 1 5 3 4 6 7 8 @@ -168,7 +168,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [('A', 'O H C H'), ('B', 'C H C H H H'), ('C', 'O H')], 'O H C H C H H H O H', [(0, 1), (0, 2), (2, 3), (2, 4), - (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}), + (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}, {}), # THF like test case with double edge and squash operator ("{[#A]=[#B]}.{#A=[!]COC[!],#B=[!]CCCC[!]}", [('A', 'O C C H H H H'), @@ -176,7 +176,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'O C C H H H H C C H H H H', [(0, 2), (0, 3), (2, 4), (2, 5), (3, 6), (3, 7), (2, 8), (3, 9), - (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}), + (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}, {}), # Toluene like test case with squash operator and aromaticity ("{[#SC3]1[#TC5][#TC5]1}.{#SC3=Cc(c[!])c[!],#TC5=[!]ccc[!]}", [('SC3', 'C C H H H C H C H'), @@ -184,7 +184,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): 'C C H H H C H C H C H C H C H', [(0, 1), (0, 2), (0, 3), (0, 4), (1, 5), (1, 7), (5, 9), (5, 6), (7, 13), (7, 8), - (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}), + (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}, {}), # simple chirality assigment with rings ("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}", # 0 1 2 3 @@ -194,7 +194,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18), (7, 19), (8, 20), (9, 21), (10, 22), (11, 23)], {1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4), - 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}), + 4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}, {}), # simple chirality assigment between fragments ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@H][>]C(=O)OC}", # 0 1 2 3 @@ -204,7 +204,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), (10, 13), (5, 14), (14, 15)], - {3: (2, 10, 4, 14)}, {}), + {3: (2, 10, 4, 14)}, {}, {}), # simple chirality assigment between fragments inv ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@@H][>]C(=O)OC}", # 0 1 2 3 @@ -214,21 +214,21 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12), (10, 13), (5, 14), (14, 15)], - {3: (2, 10, 14, 4)}, {}), + {3: (2, 10, 14, 4)}, {}, {}), # smiple ez isomerism assigment between fragments inv ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(\F)C}", [('A', 'C C F H H H'), ('B', 'C F C H H H')], 'C C F H H H F C C H H H', [(0, 1), (1, 2), (0, 3), (0, 4), (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], - {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}), + {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}, {}), # simple ez isomerism assigment between fragments inv ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(/F)C}", [('A', 'C C F H H H'), ('B', 'C F C H H H')], 'C C F H H H F C C H H H', [(0, 1), (1, 2), (0, 3), (0, 4), (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)], - {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}), + {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}, {}), # test skip virtual nodes ("{[#SP4]1.2[#SP4].3[#SP1r]1.[#TC4]23}.{#SP4=OC[$]C[$]O,#SP1r=[$]OC[$]CO}", [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), @@ -238,9 +238,29 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes): (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), (18, 21), (18, 22), (19, 23)], - {},{}), + {},{}, {}), + # test weights + ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5]C[$]C[$]O,#SP1r=[$]OC[$]CO}", + [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), + ('SP1r', 'O C C O H H H H')], + 'O C C O H H H H O C C O H H H H O C C O H H H H', + [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6), + (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), + (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), + (18, 21), (18, 22), (19, 23)], + {},{}, {0: 0.5, 4: 0.5, 8: 0.5, 12: 0.5}), + # test 2 weights + ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5][C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}", + [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'), + ('SP1r', 'O C C O H H H H')], + 'O C C O H H H H O C C O H H H H O C C O H H H H', + [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6), + (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17), + (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19), + (18, 21), (18, 22), (19, 23)], + {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.5, 8: 0.5, 9: 0.1, 12: 0.5, 13: 0.1}), )) -def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez): +def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez, weights): meta_mol, molecule = MoleculeResolver.from_string(smile).resolve() # loop and compare fragments first @@ -278,6 +298,12 @@ def _ele_match(n1, n2): if ez: ez_assigned = nx.get_node_attributes(molecule, 'ez_isomer') assert ez == ez_assigned + # check weights + if weights: + mol_weights = {node: 1 for node in ref_graph} + mol_weights.update(weights) + weights_assigned = nx.get_node_attributes(molecule, 'weight') + assert mol_weights == weights_assigned @pytest.mark.parametrize('case, cgsmiles_str, ref_string',( # case 1: here only the meta-graph is described by the