Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[1] Squash opr #1

Merged
merged 20 commits into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions cgsmiles/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,9 @@ def sort_nodes_by_attr(graph, sort_attr="fragid"):
nx.Graph
graph with nodes sorted in correct order
"""
fragids = nx.get_node_attributes(graph, "fragid")
sorted_ids = sorted(fragids.items(), key=lambda item: (item[1], item[0]))
print(sorted_ids)
mapping = {old[0]: new for new, old in enumerate(sorted_ids)}
attr_values = nx.get_node_attributes(graph, sort_attr)
sorted_ids = sorted(attr_values, key=lambda item: (attr_values[item], item))
mapping = {old: new for new, old in enumerate(sorted_ids)}
new_graph = nx.relabel_nodes(graph, mapping, copy=True)
return new_graph

Expand Down
4 changes: 2 additions & 2 deletions cgsmiles/pysmiles_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
"""
Helper function which add hydrogen atoms to the molecule graph.

First the hcount attribute produced by pysmiles us updated, because
First the hcount attribute produced by pysmiles is updated, because
fragments have no bonds at time of reading so pysmiles does not
know the connectivity. Hence the hcount is redone based on the
actual connectivity of the final molecule.
Expand Down Expand Up @@ -37,7 +37,7 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\
mol_graph.neighbors(node)]))
charge = mol_graph.nodes[node].get('charge', 0)
hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\
hcount = pysmiles.smiles_helper._valence(mol_graph, node, minimum=0) -\
bonds +\
charge
# in this case we only rebuild hydrogen atoms that are not
Expand Down
46 changes: 43 additions & 3 deletions cgsmiles/read_fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,35 @@
import pysmiles
from .read_cgsmiles import read_cgsmiles

class PeekIter(object):
    """
    Iterator over an indexable collection that supports looking
    one element ahead without advancing.

    Parameters
    ----------
    collection: sequence
        any object supporting integer indexing that raises
        ``IndexError`` past its end (e.g. ``str``, ``list``)

    Notes
    -----
    Calling ``iter()`` on an instance rewinds it to the first
    element, so every ``for`` loop starts from the beginning of
    the collection.
    """
    def __init__(self, collection):
        self.collection = collection
        # position of the element that the next call to
        # __next__ / peek will return
        self.index = 0

    def __next__(self):
        """
        Return the current element and advance by one.

        Raises
        ------
        StopIteration
            when the collection is exhausted
        """
        try:
            result = self.collection[self.index]
        except IndexError:
            # the IndexError is an implementation detail; do not
            # attach it as context to the StopIteration
            raise StopIteration from None
        self.index += 1
        return result

    def peek(self, default=""):
        """
        Return the element a subsequent ``next`` call would
        return, without advancing.

        Parameters
        ----------
        default: object
            value returned when the collection is exhausted;
            defaults to the empty string

        Returns
        -------
        object
            the upcoming element or `default`
        """
        try:
            return self.collection[self.index]
        except IndexError:
            return default

    def __iter__(self):
        # rewind, so that iteration always covers the complete
        # collection
        self.index = 0
        return self
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved


def strip_bonding_descriptors(fragment_string):
"""
Processes a CGBigSmile fragment string by
Expand All @@ -27,11 +56,13 @@ def strip_bonding_descriptors(fragment_string):
a dict mapping bonding descriptors
to the nodes within the string
"""
smile_iter = iter(fragment_string)
bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
smile_iter = PeekIter(fragment_string)
bonding_descrpt = defaultdict(list)
smile = ""
node_count = 0
prev_node = 0
current_order = None
for token in smile_iter:
if token == '[':
peek = next(smile_iter)
Expand All @@ -41,7 +72,14 @@ def strip_bonding_descriptors(fragment_string):
while peek != ']':
bond_descrp += peek
peek = next(smile_iter)
bonding_descrpt[prev_node].append(bond_descrp)
if smile_iter.peek() in bond_to_order:
order = bond_to_order[next(smile_iter)]
elif current_order:
order = current_order
current_order = None
else:
order = 1
bonding_descrpt[prev_node].append(bond_descrp + str(order))
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved
else:
atom = token
while peek != ']':
Expand All @@ -59,8 +97,10 @@ def strip_bonding_descriptors(fragment_string):
elif token == ')':
prev_node = anchor
smile += token
elif token in bond_to_order:
current_order = bond_to_order[token]
else:
if token not in '] H @ . - = # $ : / \\ + - %'\
if token not in '] H @ $ / \\ + - %'\
and not token.isdigit():
prev_node = node_count
node_count += 1
Expand Down
9 changes: 3 additions & 6 deletions cgsmiles/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def compatible(left, right):
-------
bool
"""
if left == right and left not in '> <':
if left == right and left[0] not in '> <':
return True
l, r = left[0], right[0]
if (l, r) == ('<', '>') or (l, r) == ('>', '<'):
Expand Down Expand Up @@ -177,9 +177,7 @@ def edges_from_bonding_descrpt(self):

# bonding descriptors are assumed to have bonding order 1
# unless they are specifically annotated
order = re.findall("\d+\.\d+", bonding[0])
if not order:
order = 1
order = int(bonding[0][-1])
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved
self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order)

def squash_atoms(self):
Expand All @@ -202,8 +200,7 @@ def squash_atoms(self):
self_loops=False)

# add the fragment id of the squashed node
self.molecule.nodes[node_to_keep]['fragid'] +=\
self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid']
self.molecule.nodes[node_to_keep]['fragid'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid']

def resolve(self):

Expand Down
56 changes: 30 additions & 26 deletions cgsmiles/tests/test_cgsmile_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,31 +127,35 @@ def test_read_cgsmiles(smile, nodes, edges):
# smiple symmetric bonding
("[$]COC[$]",
"COC",
{0: ["$"], 2: ["$"]}),
{0: ["$1"], 2: ["$1"]}),
# simple symmetric bonding with more than one name
("[$1A]COC[$1A]",
"COC",
{0: ["$1A1"], 2: ["$1A1"]}),
# simple symmetric but with explicit hydrogen
("[$][CH2]O[CH2][$]",
"[CH2]O[CH2]",
{0: ["$"], 2: ["$"]}),
{0: ["$1"], 2: ["$1"]}),
# simple symmetric bonding; multiple descriptors
("[$]COC[$][$1]",
"COC",
{0: ["$"], 2: ["$", "$1"]}),
{0: ["$1"], 2: ["$1", "$11"]}),
# named different bonding descriptors
("[$1]CCCC[$2]",
"CCCC",
{0: ["$1"], 3: ["$2"]}),
{0: ["$11"], 3: ["$21"]}),
# ring and bonding descriptors
("[$1]CC[$2]C1CCCCC1",
"CCC1CCCCC1",
{0: ["$1"], 1: ["$2"]}),
{0: ["$11"], 1: ["$21"]}),
# bonding descript. after branch
("C(COC[$1])[$2]CCC[$3]",
"C(COC)CCC",
{0: ["$2"], 3: ["$1"], 6: ["$3"]}),
{0: ["$21"], 3: ["$11"], 6: ["$31"]}),
# left right bonding descriptors
("[>]COC[<]",
"COC",
{0: [">"], 2: ["<"]})
{0: [">1"], 2: ["<1"]})
))
def test_strip_bonding_descriptors(big_smile, smile, bonding):
new_smile, new_bonding = strip_bonding_descriptors(big_smile)
Expand All @@ -161,50 +165,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding):
@pytest.mark.parametrize('fragment_str, nodes, edges',(
# single fragment
("{#PEO=[$]COC[$]}",
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
)},
{"PEO": [(0, 1), (1, 2)]}),
# single fragment but with explicit hydrogen in smiles
("{#PEO=[$][CH2]O[CH2][$]}",
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}),
)},
{"PEO": [(0, 1), (1, 2)]}),
# test NH3 terminal
("{#AMM=N[$]}",
{"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$"], "element": "N"}),
{"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3}),
)},
{"AMM": []}),
# single fragment + 1 terminal (i.e. only 1 bonding descrpt
("{#PEO=[$]COC[$],#OHter=[$][OH]}",
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
),
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),)},
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}),)},
{"PEO": [(0, 1), (1, 2)],
"OHter": []}),
# single fragment + 1 terminal but multiple bond descritp.
# this adjust the hydrogen count
("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}",
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}),
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3}),
),
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),)},
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}),)},
{"PEO": [(0, 1), (1, 2)],
"OHter": []}),
# single fragment + 1 terminal but multiple bond descritp.
# but explicit hydrogen in the smiles string
("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}",
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}),
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2}),
),
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}),
)},
{"PEO": [(0, 1), (1, 2),],
"OHter": []}),
Expand Down
33 changes: 31 additions & 2 deletions cgsmiles/tests/test_molecule_resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@
{0: ['>'], 1: ['$5']},
(3, 0),
('<', '>')),
# left right with annotated > <
({0: ['$'], 1: ['>1'], 3: ['<1']},
{0: ['>1'], 1: ['$5']},
(3, 0),
('<1', '>1')),
# left right selective bonding
# with identifier
({0: ['$'], 1: ['>'], 3: ['<1']},
Expand Down Expand Up @@ -53,6 +58,16 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
[(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
(4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
(11, 14), (11, 15), (11, 16), (16, 17)]),
# simple linear sequence with bond-order in link
("{[#TC1][#TC4][#TC1]}.{#TC1=[$1]=CC=[$2],#TC4=[$1]=CC=[$2]}",
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved
# 0 1 2 3 4 5 6 7 8 9
[('TC1', 'C C H H H H'), ('TC4', 'C C H H'),
# 10 11 12 13 14 15
('TC1', 'C C H H H H')],
'C C H H H H C C H H C C H H H H',
[(0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (0, 6), (6, 7),
(6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13),
(10, 14), (11, 15)]),
# simple linear sequence with unconsumed bonding descriptors
("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}",
# 0 1 2 3 4 5 6 7 8
Expand All @@ -73,6 +88,16 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
[(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
(4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
(11, 14), (11, 15), (11, 16), (16, 17)]),
# simple linear sequence with ionic ending
("{[#OH][#PEO]|2[#ON]}.{#PEO=[$]COC[$],#OH=[$]O,#ON=[$][O-]}",
# 0 1 2 3 4 5 6 7 8
[('OH', 'O H'), ('PEO', 'C O C H H H H'),
# 9 10 11 12 13 14 15 16 17
('PEO', 'C O C H H H H'), ('ON', 'O')],
'O H C O C H H H H C O C H H H H O',
[(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
(4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
(11, 14), (11, 15), (11, 16)]),
# unconsumed bonding IDs; note that this is not the same
# molecule as previous test case. Here one of the OH branches
# and replaces an CH2 group with CH-OH
Expand All @@ -82,9 +107,9 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
# 9 10 11 12 13 14 15 16 17
('PEO', 'C O C H H H H'), ('OHter', 'O H')],
'O H C O C H H H H C O C H H H H O H',
[(0, 1), (0, 2), (2, 3), (2, 5), (2, 9), (3, 4),
[(0, 1), (0, 2), (2, 3), (2, 5), (2, 11), (3, 4),
(4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13),
(10, 11), (11, 15), (11, 14), (11, 16), (16, 17)]),
(10, 11), (11, 15), (11, 14), (9, 16), (16, 17)]),
# simple branched sequence
("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}",
[('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'),
Expand Down Expand Up @@ -173,5 +198,9 @@ def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges):
def _ele_match(n1, n2):
return n1["element"] == n2["element"]

print(smile)
print(ref_graph.edges)
print(molecule.edges)
assert ref_graph.edges == molecule.edges
# check that reference graph and molecule are isomorphic
assert nx.is_isomorphic(ref_graph, molecule, node_match=_ele_match)
Loading