Skip to content

Commit

Permalink
implement weight annotation
Browse files Browse the repository at this point in the history
  • Loading branch information
fgrunewald committed Oct 14, 2024
1 parent c084a17 commit d627ac2
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 24 deletions.
5 changes: 5 additions & 0 deletions cgsmiles/pysmiles_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
ref_node = next(mol_graph.neighbors(node))
mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]
if mol_graph.nodes[node].get("element", "*") == "H":
# make sure the weights are copied for implicit h-atoms
anchor = list(mol_graph.neighbors(node))[0]
weight = mol_graph.nodes[anchor].get("weight", 1)
mol_graph.nodes[node]["weight"] = weight

def annotate_ez_isomers(molecule):
"""
Expand Down
42 changes: 39 additions & 3 deletions cgsmiles/read_fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,31 @@ def collect_ring_number(smile_iter, token, node_count, rings):

return smile_iter, token, partial_str, rings

def get_weight(smile_iter):
"""
Extracts weights given to atoms/nodes in
fragments. The iter should be advanced
up to the weight marker ;.
Parameters
----------
smile_iter: class.PeekIter
Returns
-------
float:
the weight
PeekIter
the advanced iter object
"""
num = []
for digit in smile_iter:
num.append(digit)
if smile_iter.peek() in [']', '@', 'H']:
break
out = float("".join(num))
return out, smile_iter

def strip_bonding_descriptors(fragment_string):
"""
Processes a CGSmiles fragment string by
Expand Down Expand Up @@ -122,6 +147,7 @@ def strip_bonding_descriptors(fragment_string):
rings = defaultdict(list)
ez_isomer_atoms = {}
rs_isomers = {}
weights = {}
smile = ""
node_count = 0
prev_node = 0
Expand All @@ -147,13 +173,19 @@ def strip_bonding_descriptors(fragment_string):
bonding_descrpt[prev_node].append(bond_descrp + str(order))
else:
atom = token
# set the default weight
weights[node_count] = 1
while peek != ']':
# deal with rs chirality
if peek == '@':
chiral_token = peek
if smile_iter.peek() == '@':
chiral_token = '@' + next(smile_iter)
rs_isomers[node_count] = (chiral_token, [])
# we have weights
elif peek == ';':
weight, smile_iter = get_weight(smile_iter)
weights[node_count] = weight
else:
atom += peek
peek = next(smile_iter)
Expand Down Expand Up @@ -193,6 +225,8 @@ def strip_bonding_descriptors(fragment_string):
smile += token
current_order = None
prev_node = node_count
# set default weight
weights[node_count] = 1
node_count += 1

# we need to annotate rings to the chiral isomers
Expand All @@ -201,7 +235,8 @@ def strip_bonding_descriptors(fragment_string):
if node in ring_nodes:
bonded_node = _find_bonded_ring_node(ring_nodes, node)
rs_isomers[node][1].append(bonded_node)
return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms

return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, weights

def fragment_iter(fragment_str, all_atom=True):
"""
Expand Down Expand Up @@ -230,8 +265,8 @@ def fragment_iter(fragment_str, all_atom=True):
for fragment in fragment_str[1:-1].split(','):
delim = fragment.find('=', 0)
fragname = fragment[1:delim]
big_smile = fragment[delim+1:]
smile, bonding_descrpt, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile)
frag_smile = fragment[delim+1:]
smile, bonding_descrpt, rs_isomers, ez_isomers, weights = strip_bonding_descriptors(frag_smile)
if smile == "H":
mol_graph = nx.Graph()
mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0])
Expand All @@ -258,6 +293,7 @@ def fragment_iter(fragment_str, all_atom=True):

nx.set_node_attributes(mol_graph, fragname, 'fragname')
nx.set_node_attributes(mol_graph, 0, 'fragid')
nx.set_node_attributes(mol_graph, weights, 'weight')
yield fragname, mol_graph

def read_fragments(fragment_str, all_atom=True, fragment_dict=None):
Expand Down
68 changes: 47 additions & 21 deletions cgsmiles/tests/test_molecule_resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
assert new_btypes == btypes


@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez',(
@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez, weights',(
# smiple linear seqeunce
("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}",
# 0 1 2 3 4 5 6 7 8
Expand All @@ -51,7 +51,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
[(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
(4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
(11, 14), (11, 15), (11, 16), (16, 17)],
{}, {}),
{}, {}, {}),
# smiple linear seqeunce with bond-order in link
("{[#TC1][#TC4][#TC1]}.{#TC1=[$1]=CC=[$2],#TC4=[$1]=CC=[$2]}",
# 0 1 2 3 4 5 6 7 8 9
Expand All @@ -61,7 +61,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
'C C H H H H C C H H C C H H H H',
[(0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (0, 6), (6, 7),
(6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13),
(10, 14), (11, 15)], {}, {}),
(10, 14), (11, 15)], {}, {}, {}),
# smiple linear seqeunce unconsumed bonding descrpt
("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}",
# 0 1 2 3 4 5 6 7 8
Expand All @@ -71,7 +71,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
'O H C O C H H H H C O C H H H H O H',
[(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
(4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
(11, 14), (11, 15), (11, 16), (16, 17)], {}, {}),
(11, 14), (11, 15), (11, 16), (16, 17)], {}, {}, {}),
# smiple linear seqeunce with ionic bond
("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O-].[Na+]}",
# 0 1 2 3 4 5 6 7 8
Expand All @@ -81,7 +81,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
'O Na C O C H H H H C O C H H H H O Na',
[(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
(4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
(11, 14), (11, 15), (11, 16), (16, 17)], {}, {}),
(11, 14), (11, 15), (11, 16), (16, 17)], {}, {}, {}),
# smiple linear seqeunce with ionic ending
("{[#OH][#PEO]|2[#ON]}.{#PEO=[$]COC[$],#OH=[$]O,#ON=[$][O-]}",
# 0 1 2 3 4 5 6 7 8
Expand All @@ -91,7 +91,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
'O H C O C H H H H C O C H H H H O',
[(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
(4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
(11, 14), (11, 15), (11, 16)], {}, {}),
(11, 14), (11, 15), (11, 16)], {}, {}, {}),
# uncomsumed bonding IDs; note that this is not the same
# molecule as previous test case. Here one of the OH branches
# and replaces an CH2 group with CH-OH
Expand All @@ -103,15 +103,15 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
'O H C O C H H H H C O C H H H H O H',
[(0, 1), (0, 2), (2, 3), (2, 5), (2, 11), (3, 4),
(4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13),
(10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}),
(10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}, {}),
# simple branched sequence
("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}",
[('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'),
('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), ('Hter', 'H')],
'H C C H H H C O C H H H H H C C H H H C O C H H H H H H',
[(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8),
(8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27),
(19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}),
(19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}, {}),
# something with a ring
# 012 34567
# 890123456
Expand All @@ -124,7 +124,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
(6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25),
(17, 26), (18, 19), (18, 27), (18, 33), (19, 20), (19, 24),
(20, 21), (20, 28), (21, 22), (21, 29), (22, 23), (22, 30),
(23, 24), (23, 31), (24, 32)], {}, {}),
(23, 24), (23, 31), (24, 32)], {}, {}, {}),
# something more complicated branched
# here we have multiple bonding descriptors
# # despite being the same residue we have 3 fragments after adding hydrgens
Expand All @@ -146,7 +146,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
[('A', 'O H C H H'), ('B', 'C H H C H H H'),],
'O H C H H C H H H',
[(0, 1), (0, 2), (2, 3), (2, 4), (2, 5),
(5, 6), (5, 7), (5, 8)], {}, {}),
(5, 6), (5, 7), (5, 8)], {}, {}, {}),
# smiple squash operator; unconsumed operators
("{[#A][#B]}.{#A=OC[!],#B=[$][!]CC}",
# 0 1 2 3 4 1 5 3 4 6 7 8
Expand All @@ -157,7 +157,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
[('A', 'O H C H H'), ('B', 'C H H C H H H'),],
'O H C H H C H H H',
[(0, 1), (0, 2), (2, 3), (2, 4), (2, 5),
(5, 6), (5, 7), (5, 8)], {}, {}),
(5, 6), (5, 7), (5, 8)], {}, {}, {}),
# smiple squash operator; plus connect operator
("{[#A][#B][#C]}.{#A=OC[!],#B=[$][!]CC,#C=[$]O}",
# 0 1 2 3 4 1 5 3 4 6 7 8
Expand All @@ -168,23 +168,23 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
[('A', 'O H C H'), ('B', 'C H C H H H'), ('C', 'O H')],
'O H C H C H H H O H',
[(0, 1), (0, 2), (2, 3), (2, 4),
(4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}),
(4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}, {}),
# THF like test case with double edge and squash operator
("{[#A]=[#B]}.{#A=[!]COC[!],#B=[!]CCCC[!]}",
[('A', 'O C C H H H H'),
('B', 'C C H H H H C C H H H H')],
'O C C H H H H C C H H H H',
[(0, 2), (0, 3), (2, 4), (2, 5),
(3, 6), (3, 7), (2, 8), (3, 9),
(8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}),
(8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}, {}),
# Toluene like test case with squash operator and aromaticity
("{[#SC3]1[#TC5][#TC5]1}.{#SC3=Cc(c[!])c[!],#TC5=[!]ccc[!]}",
[('SC3', 'C C H H H C H C H'),
('TC5', 'C H C H C H')],
'C C H H H C H C H C H C H C H',
[(0, 1), (0, 2), (0, 3), (0, 4), (1, 5),
(1, 7), (5, 9), (5, 6), (7, 13), (7, 8),
(9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}),
(9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}, {}),
# simple chirality assigment with rings
("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}",
# 0 1 2 3
Expand All @@ -194,7 +194,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
(2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18),
(7, 19), (8, 20), (9, 21), (10, 22), (11, 23)],
{1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4),
4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}),
4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}, {}),
# simple chirality assigment between fragments
("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@H][>]C(=O)OC}",
# 0 1 2 3
Expand All @@ -204,7 +204,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
[(0, 1), (0, 2), (2, 3), (2, 4),
(2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12),
(10, 13), (5, 14), (14, 15)],
{3: (2, 10, 4, 14)}, {}),
{3: (2, 10, 4, 14)}, {}, {}),
# simple chirality assigment between fragments inv
("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@@H][>]C(=O)OC}",
# 0 1 2 3
Expand All @@ -214,21 +214,21 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
[(0, 1), (0, 2), (2, 3), (2, 4),
(2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12),
(10, 13), (5, 14), (14, 15)],
{3: (2, 10, 14, 4)}, {}),
{3: (2, 10, 14, 4)}, {}, {}),
# smiple ez isomerism assigment between fragments inv
("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(\F)C}",
[('A', 'C C F H H H'), ('B', 'C F C H H H')],
'C C F H H H F C C H H H',
[(0, 1), (1, 2), (0, 3), (0, 4),
(0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)],
{}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}),
{}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}, {}),
# simple ez isomerism assigment between fragments inv
("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(/F)C}",
[('A', 'C C F H H H'), ('B', 'C F C H H H')],
'C C F H H H F C C H H H',
[(0, 1), (1, 2), (0, 3), (0, 4),
(0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)],
{}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}),
{}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}, {}),
# test skip virtual nodes
("{[#SP4]1.2[#SP4].3[#SP1r]1.[#TC4]23}.{#SP4=OC[$]C[$]O,#SP1r=[$]OC[$]CO}",
[('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'),
Expand All @@ -238,9 +238,29 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
(3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17),
(10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19),
(18, 21), (18, 22), (19, 23)],
{},{}),
{},{}, {}),
# test weights
("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5]C[$]C[$]O,#SP1r=[$]OC[$]CO}",
[('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'),
('SP1r', 'O C C O H H H H')],
'O C C O H H H H O C C O H H H H O C C O H H H H',
[(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6),
(3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17),
(10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19),
(18, 21), (18, 22), (19, 23)],
{},{}, {0: 0.5, 4: 0.5, 8: 0.5, 12: 0.5}),
# test 2 weights
("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5][C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}",
[('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'),
('SP1r', 'O C C O H H H H')],
'O C C O H H H H O C C O H H H H O C C O H H H H',
[(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6),
(3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17),
(10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19),
(18, 21), (18, 22), (19, 23)],
{},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.5, 8: 0.5, 9: 0.1, 12: 0.5, 13: 0.1}),
))
def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez):
def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez, weights):
meta_mol, molecule = MoleculeResolver.from_string(smile).resolve()

# loop and compare fragments first
Expand Down Expand Up @@ -278,6 +298,12 @@ def _ele_match(n1, n2):
if ez:
ez_assigned = nx.get_node_attributes(molecule, 'ez_isomer')
assert ez == ez_assigned
# check weights
if weights:
mol_weights = {node: 1 for node in ref_graph}
mol_weights.update(weights)
weights_assigned = nx.get_node_attributes(molecule, 'weight')
assert mol_weights == weights_assigned

@pytest.mark.parametrize('case, cgsmiles_str, ref_string',(
# case 1: here only the meta-graph is described by the
Expand Down

0 comments on commit d627ac2

Please sign in to comment.