Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[1] Squash opr #1

Merged
merged 20 commits into from
May 6, 2024
Merged
Changes from 15 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 21 additions & 21 deletions cgsmiles/graph_utils.py
Original file line number Diff line number Diff line change
@@ -37,13 +37,13 @@ def merge_graphs(source_graph, target_graph, max_node=None):
# We assume that the last id is always the largest.
last_node_idx = max_node
offset = last_node_idx
fragment_offset = source_graph.nodes[last_node_idx].get('fragid', 0)
fragment_offset = max(source_graph.nodes[last_node_idx].get('fragid', [0])) + 1

correspondence = {}
for idx, node in enumerate(target_graph.nodes(), start=offset + 1):
correspondence[node] = idx
new_atom = copy.copy(target_graph.nodes[node])
new_atom['fragid'] = (new_atom.get('fragid', 0) + fragment_offset)
new_atom['fragid'] = [(new_atom.get('fragid', 0) + fragment_offset)]
source_graph.add_node(idx, **new_atom)

for node1, node2 in target_graph.edges:
@@ -53,31 +53,28 @@ def merge_graphs(source_graph, target_graph, max_node=None):

return correspondence

def sort_nodes(graph, sortby_attrs=("fragid", "atomid"), target_attr=None):
def sort_nodes_by_attr(graph, sort_attr="fragid"):
"""
Sort nodes in graph by multiple attributes.
Sort nodes in graph by attribute and relable the graph in place.

Parameters
----------
graph: :class:`nx.Graph`
the graph to sort nodes of
sortby_attrs: tuple(str)
the attributes and in which order to sort
target_attr: `abc.hashable`
if not None indices are assigned to this
attribute starting at 1
sort_attr: `abc.hashable`
the attribute to use for sorting

Returns
-------
nx.Graph
graph with nodes sorted in correct order
"""
node_order = sorted(graph, key=partial(_keyfunc, graph, attrs=sortby_attrs))
for new_idx, node_key in enumerate(node_order, 1):
# graph._node.move_to_end(node_key)
if target_attr is not None:
graph.nodes[node_key][target_attr] = new_idx
return graph
fragids = nx.get_node_attributes(graph, "fragid")
sorted_ids = sorted(fragids.items(), key=lambda item: (item[1], item[0]))
print(sorted_ids)
mapping = {old[0]: new for new, old in enumerate(sorted_ids)}
new_graph = nx.relabel_nodes(graph, mapping, copy=True)
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved
return new_graph

def _keyfunc(graph, node_idx, attrs):
"""
@@ -95,23 +92,26 @@ def annotate_fragments(meta_graph, molecule):
"""
node_to_fragids = nx.get_node_attributes(molecule, 'fragid')

fragids = defaultdict(list)
for fragid, node in node_to_fragids.items():
fragids[fragid].append(node)
fragid_to_node = defaultdict(list)
for node, fragids in node_to_fragids.items():
for fragid in fragids:
fragid_to_node[fragid].append(node)

for meta_node in meta_graph.nodes:
# adding node to the fragment graph
graph_frag = nx.Graph()
for node in fragids[meta_node]:
for node in fragid_to_node[meta_node]:
attrs = molecule.nodes[node]
graph_frag.add_node(node, **attrs)

# adding the edges
# this is slow but OK; we always assume that the fragment
# is much much smaller than the fullblown graph
combinations = itertools.combinations(fragids[meta_node], r=2)
combinations = itertools.combinations(fragid_to_node[meta_node], r=2)
for a, b in combinations:
if molecule.has_edge(a, b):
graph.add_edge(a, b)
graph_frag.add_edge(a, b)
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved

meta_graph.nodes[meta_node]['graph'] = graph_frag

return meta_graph
58 changes: 58 additions & 0 deletions cgsmiles/pysmiles_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import pysmiles

VALENCES = pysmiles.smiles_helper.VALENCES
VALENCES.update({"H": (1,)})

def rebuild_h_atoms(mol_graph, keep_bonding=False):
"""
Helper function which add hydrogen atoms to the molecule graph.

First the hcount attribute produced by pysmiles us updated, because
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved
fragments have no bonds at time of reading so pysmiles does not
know the connectivity. Hence the hcount is redone based on the
actual connectivity of the final molecule.

This function also makes sure to tag single hydrogen fragments,
in order to not merge them with adjecent fragments. Otherwise,
explicit single hydrogen residues would not be possible.

The molecule graph is updated in place with the hydrogen atoms
that are missing.

Using the keep_bonding argument the hydrogen count is reduced
by the number of bonding descriptors. In this way hydrogen
atoms can also be added to fragments only.

Parameters
----------
mol_graph: :class:`nx.Graph`
graph describing the full molecule without hydrogen atoms
"""
for node in mol_graph.nodes:
if mol_graph.nodes[node].get('bonding', False):
# get the degree
ele = mol_graph.nodes[node]['element']
# hcount is the valance minus the degree minus
# the number of bonding descriptors
bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\
mol_graph.neighbors(node)]))
charge = mol_graph.nodes[node].get('charge', 0)
hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved
bonds +\
charge
# in this case we only rebuild hydrogen atoms that are not
# replaced by bonding operators.
if keep_bonding:
hcount -= len(mol_graph.nodes[node]['bonding'])

mol_graph.nodes[node]['hcount'] = hcount
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved
if ele == "H":
mol_graph.nodes[node]['single_h_frag'] = True
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved

pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)
for node in mol_graph.nodes:
if mol_graph.nodes[node].get("element", "*") == "H" and\
not mol_graph.nodes[node].get("single_h_frag", False):
ref_node = next(mol_graph.neighbors(node))
mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]
37 changes: 1 addition & 36 deletions cgsmiles/read_fragments.py
Original file line number Diff line number Diff line change
@@ -35,7 +35,7 @@ def strip_bonding_descriptors(fragment_string):
for token in smile_iter:
if token == '[':
peek = next(smile_iter)
if peek in ['$', '>', '<']:
if peek in ['$', '>', '<', '!']:
bond_descrp = peek
peek = next(smile_iter)
while peek != ']':
@@ -67,39 +67,6 @@ def strip_bonding_descriptors(fragment_string):
smile += token
return smile, bonding_descrpt

def _rebuild_h_atoms(mol_graph):
# special hack around to fix
# pysmiles bug for a single
# atom molecule; we assume that the
# hcount is just wrong and set it to
# the valance number minus bonds minus
# bonding connectors
if len(mol_graph.nodes) == 1:
ele = mol_graph.nodes[0]['element']
# for N and P we assume the regular valency
hcount = pysmiles.smiles_helper.VALENCES[ele][0]
if mol_graph.nodes[0].get('bonding', False):
hcount -= 1
mol_graph.nodes[0]['hcount'] = hcount
else:
for node in mol_graph.nodes:
if mol_graph.nodes[node].get('bonding', False):
# get the degree
ele = mol_graph.nodes[node]['element']
# hcount is the valance minus the degree minus
# the number of bonding descriptors
bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\
mol_graph.neighbors(node)]))
charge = mol_graph.nodes[node].get('charge', 0)
hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\
bonds -\
len(mol_graph.nodes[node]['bonding']) +\
charge
mol_graph.nodes[node]['hcount'] = hcount

pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)
return mol_graph

def fragment_iter(fragment_str, all_atom=True):
"""
Iterates over fragments defined in a CGBigSmile string.
@@ -137,8 +104,6 @@ def fragment_iter(fragment_str, all_atom=True):
elif all_atom:
mol_graph = pysmiles.read_smiles(smile)
nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
# we need to rebuild hydrogen atoms now
_rebuild_h_atoms(mol_graph)
# we deal with a CG resolution graph
else:
mol_graph = read_cgsmiles(smile)
101 changes: 54 additions & 47 deletions cgsmiles/resolve.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import re
import copy
import networkx as nx
import pysmiles
from .read_cgsmiles import read_cgsmiles
from .read_fragments import read_fragments
from .graph_utils import merge_graphs, sort_nodes, annotate_fragments

VALENCES = pysmiles.smiles_helper.VALENCES
VALENCES.update({"H": (1,)})
from .graph_utils import merge_graphs, sort_nodes_by_attr, annotate_fragments
from .pysmiles_utils import rebuild_h_atoms

def compatible(left, right):
"""
@@ -129,6 +128,10 @@ def __init__(self,

def resolve_disconnected_molecule(self):
"""
Given a connected graph of nodes with associated fragment graphs
generate a disconnected graph of the fragments and annotate
each fragment graph to the node in the higher resolution
graph.
"""
for meta_node in self.meta_graph.nodes:
fragname = self.meta_graph.nodes[meta_node]['fragname']
@@ -139,19 +142,20 @@ def resolve_disconnected_molecule(self):

for node in fragment.nodes:
new_node = correspondence[node]
attrs = self.molecule.nodes[new_node]
attrs = copy.deepcopy(self.molecule.nodes[new_node])
graph_frag.add_node(correspondence[node], **attrs)
nx.set_node_attributes(graph_frag, meta_node, 'fragid')
nx.set_node_attributes(graph_frag, [meta_node], 'fragid')

for a, b in fragment.edges:
new_a = correspondence[a]
new_b = correspondence[b]
attrs = copy.deepcopy(fragment.edges[(a, b)])
graph_frag.add_edge(new_a,
new_b)
new_b,
**attrs)

self.meta_graph.nodes[meta_node]['graph'] = graph_frag


def edges_from_bonding_descrpt(self):
"""
Make edges according to the bonding descriptors stored
@@ -167,64 +171,67 @@ def edges_from_bonding_descrpt(self):
edge, bonding = generate_edge(prev_graph,
node_graph)

#To DO - clean copying of bond-list attribute
# this is a bit of a workaround because at this stage the
# bonding list is actually shared between all residues of
# of the same type; so we first make a copy then we replace
# the list sans used bonding descriptor
prev_bond_list = prev_graph.nodes[edge[0]]['bonding'].copy()
prev_bond_list.remove(bonding[0])
prev_graph.nodes[edge[0]]['bonding'] = prev_bond_list
node_bond_list = node_graph.nodes[edge[1]]['bonding'].copy()
node_bond_list.remove(bonding[1])
node_graph.nodes[edge[1]]['bonding'] = node_bond_list
order = re.findall("\d+\.\d+", bonding[0])
# remove used bonding descriptors
prev_graph.nodes[edge[0]]['bonding'].remove(bonding[0])
node_graph.nodes[edge[1]]['bonding'].remove(bonding[1])

# bonding descriptors are assumed to have bonding order 1
# unless they are specifically annotated
order = re.findall("\d+\.\d+", bonding[0])
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved
if not order:
order = 1
self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order)

def replace_unconsumed_bonding_descrpt(self):
def squash_atoms(self):
"""
We allow multiple bonding descriptors per atom, which
however, are not always consumed. In this case the left
over bonding descriptors are replaced by hydrogen atoms.
Applies the squash operator by removing the duplicate node
adding, all edges from that node to the remaining one, and
annotate the other node with the fragid of the removed
node.
"""
for meta_node in self.meta_graph.nodes:
graph = self.meta_graph.nodes[meta_node]['graph']
bonding = nx.get_node_attributes(graph, "bonding")
for node, bondings in bonding.items():
if bondings:
element = graph.nodes[node]['element']
bonds = round(sum([self.molecule.edges[(node, neigh)]['order'] for neigh in\
self.molecule.neighbors(node)]))
hcount = VALENCES[element][0] - bonds
attrs = {attr: graph.nodes[node][attr] for attr in ['fragname', 'fragid']}
attrs['element'] = 'H'
for _ in range(0, hcount):
new_node = len(self.molecule.nodes) + 1
graph.add_edge(node, new_node)
attrs['atomname'] = "H" + str(len(graph.nodes)-1)
graph.nodes[new_node].update(attrs)
self.molecule.add_edge(node, new_node, order=1)
self.molecule.nodes[new_node].update(attrs)
# now we want to sort the atoms
sort_nodes(self.molecule)
# and redo the meta molecule
self.meta_graph = annotate_fragments(self.meta_graph, self.molecule)
bondings = nx.get_edge_attributes(self.molecule, 'bonding')
squashed = False
for edge, bonding in bondings.items():
if not bonding[0].startswith('!'):
continue
# let's squash two nodes
node_to_keep, node_to_remove = edge
self.molecule = nx.contracted_nodes(self.molecule,
node_to_keep,
node_to_remove,
self_loops=False)

# add the fragment id of the sequashed node
self.molecule.nodes[node_to_keep]['fragid'] +=\
self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid']
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved

def resolve(self):

if self.cgsmiles_string is not None:
self.meta_graph = read_cgsmiles(self.cgsmiles_string)

if self.fragment_string is not None:
self.fragment_dict.update(read_fragments(self.fragment_string,
all_atom=self.all_atom))

# add disconnected fragments to graph
self.resolve_disconnected_molecule()

# connect valid bonding descriptors
self.edges_from_bonding_descrpt()

# contract atoms with squash descriptors
self.squash_atoms()

# rebuild hydrogen in all-atom case
if self.all_atom:
self.replace_unconsumed_bonding_descrpt()
rebuild_h_atoms(self.molecule)

# sort the atoms
self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid"))
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved

# and redo the meta molecule
self.meta_graph = annotate_fragments(self.meta_graph,
self.molecule)

return self.meta_graph, self.molecule
46 changes: 12 additions & 34 deletions cgsmiles/tests/test_cgsmile_parsing.py
Original file line number Diff line number Diff line change
@@ -164,72 +164,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding):
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(3, {"atomname": "H3", "fragname": "PEO", "element": "H"}),
(4, {"atomname": "H4", "fragname": "PEO", "element": "H"}),
(5, {"atomname": "H5", "fragname": "PEO", "element": "H"}),
(6, {"atomname": "H6", "fragname": "PEO", "element": "H"}),
)},
{"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}),
{"PEO": [(0, 1), (1, 2)]}),
# single fragment but with explicit hydrogen in smiles
("{#PEO=[$][CH2]O[CH2][$]}",
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(3, {"atomname": "H3", "fragname": "PEO", "element": "H"}),
(4, {"atomname": "H4", "fragname": "PEO", "element": "H"}),
(5, {"atomname": "H5", "fragname": "PEO", "element": "H"}),
(6, {"atomname": "H6", "fragname": "PEO", "element": "H"}),
)},
{"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}),
{"PEO": [(0, 1), (1, 2)]}),
# test NH3 terminal
("{#AMM=N[$]}",
{"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$"], "element": "N"}),
(1, {"atomname": "H1", "fragname": "AMM", "element": "H"}),
(2, {"atomname": "H2", "fragname": "AMM", "element": "H"}),
)},
{"AMM": [(0, 1), (0, 2)]}),
{"AMM": []}),
# single fragment + 1 terminal (i.e. only 1 bonding descrpt
("{#PEO=[$]COC[$],#OHter=[$][OH]}",
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(3, {"atomname": "H3", "fragname": "PEO", "element": "H"}),
(4, {"atomname": "H4", "fragname": "PEO", "element": "H"}),
(5, {"atomname": "H5", "fragname": "PEO", "element": "H"}),
(6, {"atomname": "H6", "fragname": "PEO", "element": "H"}),
),
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),
(1, {"atomname": "H1", "fragname": "OHter", "element": "H"}))},
{"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)],
"OHter": [(0, 1)]}),
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),)},
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved
{"PEO": [(0, 1), (1, 2)],
"OHter": []}),
# single fragment + 1 terminal but multiple bond descritp.
# this adjust the hydrogen count
("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}",
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}),
(3, {"atomname": "H3", "fragname": "PEO", "element": "H"}),
(4, {"atomname": "H4", "fragname": "PEO", "element": "H"}),
(5, {"atomname": "H5", "fragname": "PEO", "element": "H"}),
),
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),
(1, {"atomname": "H1", "fragname": "OHter", "element": "H"}))},
{"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)],
"OHter": [(0, 1)]}),
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),)},
{"PEO": [(0, 1), (1, 2)],
"OHter": []}),
# single fragment + 1 terminal but multiple bond descritp.
# but explicit hydrogen in the smiles string
("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}",
{"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
(1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
(2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}),
(3, {"atomname": "H3", "fragname": "PEO", "element": "H"}),
(4, {"atomname": "H4", "fragname": "PEO", "element": "H"}),
(5, {"atomname": "H5", "fragname": "PEO", "element": "H"}),
),
"OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),
(1, {"atomname": "H1", "fragname": "OHter", "element": "H"}))},
{"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)],
"OHter": [(0, 1)]}),
)},
{"PEO": [(0, 1), (1, 2),],
"OHter": []}),
))
def test_fragment_iter(fragment_str, nodes, edges):
82 changes: 68 additions & 14 deletions cgsmiles/tests/test_molecule_resolve.py
Original file line number Diff line number Diff line change
@@ -42,13 +42,14 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
assert new_btypes == btypes


@pytest.mark.parametrize('smile, ref_nodes, ref_edges',(
@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges',(
# smiple linear seqeunce
("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}",
# 0 1 2 3 4 5 6 7 8
[('OHter', 'O H'), ('PEO', 'C O C H H H H'),
# 9 10 11 12 13 14 15 16 17
('PEO', 'C O C H H H H'), ('OHter', 'O H')],
'O H C O C H H H H C O C H H H H O H',
[(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
(4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
(11, 14), (11, 15), (11, 16), (16, 17)]),
@@ -58,6 +59,7 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
[('OHter', 'O H'), ('PEO', 'C O C H H H H'),
# 9 10 11 12 13 14 15 16 17
('PEO', 'C O C H H H H'), ('OHter', 'O H')],
'O H C O C H H H H C O C H H H H O H',
[(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
(4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
(11, 14), (11, 15), (11, 16), (16, 17)]),
@@ -67,23 +69,27 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
[('OHter', 'O Na'), ('PEO', 'C O C H H H H'),
# 9 10 11 12 13 14 15 16 17
('PEO', 'C O C H H H H'), ('OHter', 'O Na')],
'O Na C O C H H H H C O C H H H H O Na',
[(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
(4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
(11, 14), (11, 15), (11, 16), (16, 17)]),
# uncomsumed bonding IDs; note that this is not the same
# molecule as previous test case. Here one of the OH branches
# and replaces an CH2 group with CH-OH
("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[>][$1]COC[<],#OHter=[$1][O]}",
("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[>][$1A]COC[<],#OHter=[$1A][O]}",
# 0 1 2 3 4 5 6 7 8
[('OHter', 'O H'), ('PEO', 'C O C H H H H'),
# 9 10 11 12 13 14 15 16 17
('PEO', 'C O C H H H H'), ('OHter', 'O H')],
[(0, 1), (0, 2), (2, 3), (2, 5), (2, 10), (3, 4),
(4, 6), (4, 7), (4, 17), (8, 9), (8, 11), (8, 14),
(8, 18), (9, 10), (10, 12), (10, 13), (14, 15)]),
'O H C O C H H H H C O C H H H H O H',
[(0, 1), (0, 2), (2, 3), (2, 5), (2, 9), (3, 4),
(4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13),
(10, 11), (11, 15), (11, 14), (11, 16), (16, 17)]),
# simple branched sequence
("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}",
[('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'),
('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), ('Hter', 'H')],
'H C C H H H C O C H H H H H C C H H H C O C H H H H H H',
[(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8),
(8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27),
(19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)]),
@@ -93,6 +99,7 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$]H}",
[('Hter', 'H'), ('PS', 'C C C C C C C C H H H H H H H H'),
('PS', 'C C C C C C C C H H H H H H H H'), ('Hter', 'H')],
'H C C C C C C C C H H H H H H H H C C C C C C C C H H H H H H H H H',
[(0, 1), (1, 2), (1, 9), (1, 10), (2, 3), (2, 11), (2, 17),
(3, 4), (3, 8), (4, 5), (4, 12), (5, 6), (5, 13), (6, 7),
(6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25),
@@ -107,17 +114,64 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
# ('PE', 'C C H H H H H'), ('PE', 'C C H H H H'), ('PE', 'C C H H H'),
# ('PE', 'C C H H H H'), ('PE', 'C C H H H H H'), ('PE', 'C C H H H H H')]
# [,
# (
# (
# smiple squash operator; no unconsumed operators
("{[#A][#B]}.{#A=OC[!],#B=[!]CC}",
# 0 1 2 3 4 1 5 3 4 6 7 8
# note that the order here is O H then C H H
# because the C H H is shared between fragments
# so that sorting by fragment id yields this order
# the same goes for the second fragment but here
# the CH2 goes in the front because it also belongs
# to the fragment with lower fragid
[('A', 'O H C H H'), ('B', 'C H H C H H H'),],
'O H C H H C H H H',
[(0, 1), (0, 2), (2, 3), (2, 4), (2, 5),
(5, 6), (5, 7), (5, 8)]),
# smiple squash operator; unconsumed operators
("{[#A][#B]}.{#A=OC[!],#B=[$][!]CC}",
# 0 1 2 3 4 1 5 3 4 6 7 8
# 0 1 2 3 4 5 6 7 11 8 9 10
# note that the unconsumed $ triggers rebuild of a hydrogen
# which however is appended to the end of the molecule so
# making it 11
[('A', 'O H C H H'), ('B', 'C H H C H H H'),],
'O H C H H C H H H',
[(0, 1), (0, 2), (2, 3), (2, 4), (2, 5),
(5, 6), (5, 7), (5, 8)]),
# smiple squash operator; plus connect operator
("{[#A][#B][#C]}.{#A=OC[!],#B=[$][!]CC,#C=[$]O}",
# 0 1 2 3 4 1 5 3 4 6 7 8
# 0 1 2 3 4 5 6 7 11 8 9 10
# note that the unconsumed $ triggers rebuild of a hydrogen
# which however is appended to the end of the molecule so
# making it 11
fgrunewald marked this conversation as resolved.
Show resolved Hide resolved
[('A', 'O H C H'), ('B', 'C H C H H H'), ('C', 'O H')],
'O H C H C H H H O H',
[(0, 1), (0, 2), (2, 3), (2, 4),
(4, 5), (4, 6), (4, 7), (2, 8), (8, 9)]),
))
def test_def_big_smile_parser(smile, ref_nodes, ref_edges):
def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges):
meta_mol, molecule = MoleculeResolver(smile).resolve()
# nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element'))
# plt.show()
for node, ref in zip(meta_mol.nodes, ref_nodes):

# loop and compare fragments first
for node, ref in zip(meta_mol.nodes, ref_frags):
assert meta_mol.nodes[node]['fragname'] == ref[0]
block_graph = meta_mol.nodes[node]['graph']
elements = nx.get_node_attributes(block_graph, 'element') #.values())
sorted_elements = [elements[key] for key in sorted(elements)]
target_elements = nx.get_node_attributes(block_graph, 'element')
sorted_elements = [target_elements[key] for key in sorted(target_elements)]
assert sorted_elements == ref[1].split()
assert sorted(molecule.edges) == sorted(ref_edges)

# make the full scale reference graph
ref_graph = nx.Graph()
ref_graph.add_edges_from(ref_edges)
nx.set_node_attributes(ref_graph,
dict(zip(sorted(ref_graph.nodes), elements.split())),
'element')

def _ele_match(n1, n2):
return n1["element"] == n2["element"]

# check that reference graph and molecule are isomorphic
assert nx.is_isomorphic(ref_graph, molecule, node_match=_ele_match)