Skip to content

Commit

Permalink
Merge pull request #1 from gruenewald-lab/squash-opr
Browse files Browse the repository at this point in the history
[1] Squash opr
  • Loading branch information
fgrunewald authored May 6, 2024
2 parents b2848c7 + 9b91d31 commit 56752fe
Show file tree
Hide file tree
Showing 6 changed files with 314 additions and 182 deletions.
41 changes: 20 additions & 21 deletions cgsmiles/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@ def merge_graphs(source_graph, target_graph, max_node=None):
# We assume that the last id is always the largest.
last_node_idx = max_node
offset = last_node_idx
fragment_offset = source_graph.nodes[last_node_idx].get('fragid', 0)
fragment_offset = max(source_graph.nodes[last_node_idx].get('fragid', [0])) + 1

correspondence = {}
for idx, node in enumerate(target_graph.nodes(), start=offset + 1):
correspondence[node] = idx
new_atom = copy.copy(target_graph.nodes[node])
new_atom['fragid'] = (new_atom.get('fragid', 0) + fragment_offset)
new_atom['fragid'] = [(new_atom.get('fragid', 0) + fragment_offset)]
source_graph.add_node(idx, **new_atom)

for node1, node2 in target_graph.edges:
Expand All @@ -53,31 +53,27 @@ def merge_graphs(source_graph, target_graph, max_node=None):

return correspondence

def sort_nodes(graph, sortby_attrs=("fragid", "atomid"), target_attr=None):
def sort_nodes_by_attr(graph, sort_attr="fragid"):
"""
Sort nodes in graph by multiple attributes.
Sort nodes in graph by attribute and relable the graph in place.
Parameters
----------
graph: :class:`nx.Graph`
the graph to sort nodes of
sortby_attrs: tuple(str)
the attributes and in which order to sort
target_attr: `abc.hashable`
if not None indices are assigned to this
attribute starting at 1
sort_attr: `abc.hashable`
the attribute to use for sorting
Returns
-------
nx.Graph
graph with nodes sorted in correct order
"""
node_order = sorted(graph, key=partial(_keyfunc, graph, attrs=sortby_attrs))
for new_idx, node_key in enumerate(node_order, 1):
# graph._node.move_to_end(node_key)
if target_attr is not None:
graph.nodes[node_key][target_attr] = new_idx
return graph
attr_values = nx.get_node_attributes(graph, sort_attr)
sorted_ids = sorted(attr_values, key=lambda item: (attr_values[item], item))
mapping = {old: new for new, old in enumerate(sorted_ids)}
new_graph = nx.relabel_nodes(graph, mapping, copy=True)
return new_graph

def _keyfunc(graph, node_idx, attrs):
"""
Expand All @@ -95,23 +91,26 @@ def annotate_fragments(meta_graph, molecule):
"""
node_to_fragids = nx.get_node_attributes(molecule, 'fragid')

fragids = defaultdict(list)
for fragid, node in node_to_fragids.items():
fragids[fragid].append(node)
fragid_to_node = defaultdict(list)
for node, fragids in node_to_fragids.items():
for fragid in fragids:
fragid_to_node[fragid].append(node)

for meta_node in meta_graph.nodes:
# adding node to the fragment graph
graph_frag = nx.Graph()
for node in fragids[meta_node]:
for node in fragid_to_node[meta_node]:
attrs = molecule.nodes[node]
graph_frag.add_node(node, **attrs)

# adding the edges
# this is slow but OK; we always assume that the fragment
# is much much smaller than the fullblown graph
combinations = itertools.combinations(fragids[meta_node], r=2)
combinations = itertools.combinations(fragid_to_node[meta_node], r=2)
for a, b in combinations:
if molecule.has_edge(a, b):
graph.add_edge(a, b)
graph_frag.add_edge(a, b)

meta_graph.nodes[meta_node]['graph'] = graph_frag

return meta_graph
58 changes: 58 additions & 0 deletions cgsmiles/pysmiles_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import pysmiles

VALENCES = pysmiles.smiles_helper.VALENCES
VALENCES.update({"H": (1,)})

def rebuild_h_atoms(mol_graph, keep_bonding=False):
"""
Helper function which add hydrogen atoms to the molecule graph.
First the hcount attribute produced by pysmiles is updated, because
fragments have no bonds at time of reading so pysmiles does not
know the connectivity. Hence the hcount is redone based on the
actual connectivity of the final molecule.
This function also makes sure to tag single hydrogen fragments,
in order to not merge them with adjecent fragments. Otherwise,
explicit single hydrogen residues would not be possible.
The molecule graph is updated in place with the hydrogen atoms
that are missing.
Using the keep_bonding argument the hydrogen count is reduced
by the number of bonding descriptors. In this way hydrogen
atoms can also be added to fragments only.
Parameters
----------
mol_graph: :class:`nx.Graph`
graph describing the full molecule without hydrogen atoms
"""
for node in mol_graph.nodes:
if mol_graph.nodes[node].get('bonding', False):
# get the degree
ele = mol_graph.nodes[node]['element']
# hcount is the valance minus the degree minus
# the number of bonding descriptors
bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\
mol_graph.neighbors(node)]))
charge = mol_graph.nodes[node].get('charge', 0)
hcount = pysmiles.smiles_helper._valence(mol_graph, node, minimum=0) -\
bonds +\
charge
# in this case we only rebuild hydrogen atoms that are not
# replaced by bonding operators.
if keep_bonding:
hcount -= len(mol_graph.nodes[node]['bonding'])

mol_graph.nodes[node]['hcount'] = hcount
if ele == "H":
mol_graph.nodes[node]['single_h_frag'] = True

pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)
for node in mol_graph.nodes:
if mol_graph.nodes[node].get("element", "*") == "H" and\
not mol_graph.nodes[node].get("single_h_frag", False):
ref_node = next(mol_graph.neighbors(node))
mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]
84 changes: 45 additions & 39 deletions cgsmiles/read_fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,36 @@
import pysmiles
from .read_cgsmiles import read_cgsmiles

class PeekIter(object):
"""
Custom iter that allows looking ahead, without
advancing the actual iter.
"""
def __init__(self, collection):
self.collection = iter(collection)
self._peek = None

def __next__(self):
if self._peek:
item = self._peek
self._peek = None
else:
item = next(self.collection)
return item

def peek(self):
if self._peek:
return self._peek
try:
self._peek = next(self)
except StopIteration:
self._peek = None
return self._peek

def __iter__(self):
return self


def strip_bonding_descriptors(fragment_string):
"""
Processes a CGBigSmile fragment string by
Expand All @@ -27,21 +57,30 @@ def strip_bonding_descriptors(fragment_string):
a dict mapping bonding descriptors
to the nodes within the string
"""
smile_iter = iter(fragment_string)
bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
smile_iter = PeekIter(fragment_string)
bonding_descrpt = defaultdict(list)
smile = ""
node_count = 0
prev_node = 0
current_order = None
for token in smile_iter:
if token == '[':
peek = next(smile_iter)
if peek in ['$', '>', '<']:
if peek in ['$', '>', '<', '!']:
bond_descrp = peek
peek = next(smile_iter)
while peek != ']':
bond_descrp += peek
peek = next(smile_iter)
bonding_descrpt[prev_node].append(bond_descrp)
if smile_iter.peek() in bond_to_order:
order = bond_to_order[next(smile_iter)]
elif current_order:
order = current_order
current_order = None
else:
order = 1
bonding_descrpt[prev_node].append(bond_descrp + str(order))
else:
atom = token
while peek != ']':
Expand All @@ -59,47 +98,16 @@ def strip_bonding_descriptors(fragment_string):
elif token == ')':
prev_node = anchor
smile += token
elif token in bond_to_order:
current_order = bond_to_order[token]
else:
if token not in '] H @ . - = # $ : / \\ + - %'\
if token not in '] H @ $ / \\ + - %'\
and not token.isdigit():
prev_node = node_count
node_count += 1
smile += token
return smile, bonding_descrpt

def _rebuild_h_atoms(mol_graph):
# special hack around to fix
# pysmiles bug for a single
# atom molecule; we assume that the
# hcount is just wrong and set it to
# the valance number minus bonds minus
# bonding connectors
if len(mol_graph.nodes) == 1:
ele = mol_graph.nodes[0]['element']
# for N and P we assume the regular valency
hcount = pysmiles.smiles_helper.VALENCES[ele][0]
if mol_graph.nodes[0].get('bonding', False):
hcount -= 1
mol_graph.nodes[0]['hcount'] = hcount
else:
for node in mol_graph.nodes:
if mol_graph.nodes[node].get('bonding', False):
# get the degree
ele = mol_graph.nodes[node]['element']
# hcount is the valance minus the degree minus
# the number of bonding descriptors
bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\
mol_graph.neighbors(node)]))
charge = mol_graph.nodes[node].get('charge', 0)
hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\
bonds -\
len(mol_graph.nodes[node]['bonding']) +\
charge
mol_graph.nodes[node]['hcount'] = hcount

pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)
return mol_graph

def fragment_iter(fragment_str, all_atom=True):
"""
Iterates over fragments defined in a CGBigSmile string.
Expand Down Expand Up @@ -137,8 +145,6 @@ def fragment_iter(fragment_str, all_atom=True):
elif all_atom:
mol_graph = pysmiles.read_smiles(smile)
nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
# we need to rebuild hydrogen atoms now
_rebuild_h_atoms(mol_graph)
# we deal with a CG resolution graph
else:
mol_graph = read_cgsmiles(smile)
Expand Down
Loading

0 comments on commit 56752fe

Please sign in to comment.