Skip to content

Commit

Permalink
Merge pull request #20 from gruenewald-lab/isomerism
Browse files Browse the repository at this point in the history
implement isomerism bug fixes plus tests
  • Loading branch information
fgrunewald authored Oct 7, 2024
2 parents 2aaf60b + 3d58e50 commit 4430b6b
Show file tree
Hide file tree
Showing 6 changed files with 371 additions and 35 deletions.
33 changes: 32 additions & 1 deletion cgsmiles/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ def merge_graphs(source_graph, target_graph, max_node=None):
correspondence[node] = idx
new_atom = copy.deepcopy(target_graph.nodes[node])
new_atom['fragid'] = [(new_atom.get('fragid', 0) + fragment_offset)]
# make sure to propagate the ez isomers
if 'ez_isomer_atoms' in new_atom:
new_atom['ez_isomer_atoms'] = (new_atom['ez_isomer_atoms'][0]+offset+1,
new_atom['ez_isomer_atoms'][1]+offset+1)
# make sure to propagate the ez atoms
source_graph.add_node(idx, **new_atom)

for node1, node2 in target_graph.edges:
Expand All @@ -53,7 +58,9 @@ def merge_graphs(source_graph, target_graph, max_node=None):

return correspondence

def sort_nodes_by_attr(graph, sort_attr="fragid"):
def sort_nodes_by_attr(graph,
sort_attr="fragid",
relative_attr=[('ez_isomer_atoms', True)]):
"""
Sort nodes in graph by attribute and relable the graph in place.
Expand All @@ -63,6 +70,13 @@ def sort_nodes_by_attr(graph, sort_attr="fragid"):
the graph to sort nodes of
sort_attr: `abc.hashable`
the attribute to use for sorting
relative_attr: tuple(str, bool)
a list of attributes that are sensetive
to the ordering of nodes (i.e. refer to
other nodes). The second element indicates
the depth. If False the value of attr are
node keys. If True we expect the value to
be an itertable with node keys.
Returns
-------
Expand All @@ -73,6 +87,23 @@ def sort_nodes_by_attr(graph, sort_attr="fragid"):
sorted_ids = sorted(attr_values, key=lambda item: (attr_values[item], item))
mapping = {old: new for new, old in enumerate(sorted_ids)}
new_graph = nx.relabel_nodes(graph, mapping, copy=True)
for attr, is_list in relative_attr:
attr_dict = nx.get_node_attributes(new_graph, attr)
new_dict = {}
for key, values in attr_dict.items():
try:
iter(values)
except TypeError:
is_list = False
else:
is_list = not isinstance(values, str)
if is_list:
new_values = [mapping[value] for value in values]
else:
new_values = mapping[values]
new_dict[key] = new_values
if new_dict:
nx.set_node_attributes(new_graph, new_dict, attr)
return new_graph

def _keyfunc(graph, node_idx, attrs):
Expand Down
89 changes: 89 additions & 0 deletions cgsmiles/pysmiles_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import logging
import networkx as nx
import pysmiles
from pysmiles.smiles_helper import (_annotate_ez_isomers,
_mark_chiral_atoms,
remove_explicit_hydrogens,
add_explicit_hydrogens)

LOGGER = logging.getLogger(__name__)

def compute_mass(input_molecule):
"""
Expand Down Expand Up @@ -76,3 +83,85 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
ref_node = next(mol_graph.neighbors(node))
mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]

def annotate_ez_isomers(molecule):
"""
Small wrapper dealing with ez_isomer annotation.
Parameters
----------
molecule: nx.Graph
The molecule of interest, which must of ez_isomer_pairs
and ez_isomer_class set as node attributes
"""
ez_isomer_atoms = nx.get_node_attributes(molecule, 'ez_isomer_atoms')
ez_isomer_class = nx.get_node_attributes(molecule, 'ez_isomer_class')
ez_isomer_atoms_list = [atoms + [_class] for atoms, _class in zip(ez_isomer_atoms.values(), ez_isomer_class.values())]
ez_isomer_pairs = list(zip(ez_isomer_atoms_list[:-1], ez_isomer_atoms_list[1:]))
if len(ez_isomer_atoms)%2 != 0:
msg = ("You have an uneven amount of atoms marked as CIS/TRANS isomers."
"We will drop the last atom from assigning the iosmers.")
LOGGER.warning(msg)
_annotate_ez_isomers(molecule, ez_isomer_pairs)
# clean up
for node in ez_isomer_atoms:
del molecule.nodes[node]['ez_isomer_atoms']
del molecule.nodes[node]['ez_isomer_class']

def mark_chiral_atoms(molecule):
"""
For all nodes tagged as chiral, figure out the three
substituents and annotate the node with a tuple that
has the order in which to rotate. This essentially
corresponds to the definition of an improper dihedral
angle centered on the chiral atom.
Pysmiles treats explicit hydrogen atoms differently
from implicit ones. In cgsmiles at the time when this
function is called all hydrogen atoms have been added
explicitly. However, we need to correct the chirality
assigment for this.
Note that this means it is not possible to have explicit
hydrogen atoms attached to chiral centers within cgsmiles
to begin with. However, this is a rather edge case.
"""
chiral_nodes = nx.get_node_attributes(molecule, 'rs_isomer')
for node, (direction, rings) in chiral_nodes.items():
# first the ring atoms in order
# that they were connected then the
# other neighboring atoms in order
bonded_neighbours = sorted(molecule[node])
neighbours = list(rings)
hstash = []
for neighbour in bonded_neighbours:
if neighbour not in neighbours:
# all hydrogen atoms are explicit in cgsmiles
# BUT in the input of chirality they are usual
# implicit. At this point we need to treat all
# hydrogen atoms as if they were implicit.
if molecule.nodes[neighbour]['element'] != 'H':
neighbours.append(neighbour)
else:
hstash.append(neighbour)
if hstash and len(hstash) == 1:
neighbours.insert(1, hstash[0])
elif len(hstash) > 1:
msg = (f"Chiral node {node} as more than 1 hydrogen neighbor."
"Therefore it is not chiral.")
raise ValueError(msg)

if len(neighbours) != 4:
# FIXME Tetrahedral Allene-like Systems such as `NC(Br)=[C@]=C(O)C`
msg = (f"Chiral node {node} has {len(neighbours)} neighbors, which "
"is different than the four expected for tetrahedral "
"chirality.")
raise ValueError(msg)
# the default is anti-clockwise sorting indicated by '@'
# in this case the nodes are sorted with increasing
# node index; however @@ means clockwise and the
# order of nodes is reversed (i.e. with decreasing index)
if direction == '@@':
neighbours = [neighbours[0], neighbours[1], neighbours[3], neighbours[2]]

molecule.nodes[node]['rs_isomer'] = tuple(neighbours)
106 changes: 101 additions & 5 deletions cgsmiles/read_fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,65 @@ def peek(self):
def __iter__(self):
return self

def _find_bonded_ring_node(ring_nodes, node):
current = ring_nodes.index(node)
if current%2 == 0:
other = ring_nodes[current+1]
else:
other = ring_nodes[current-1]
return other

def collect_ring_number(smile_iter, token, node_count, rings):
"""
When a ring identifier is found, this function will add
the current node to the rings dict.
Parameters
----------
smile_iter: :class:PeekIter
token: str
node_count: int
rings: dict[list]
Returns
-------
PeekIter
the advanced smiles_iter
str
the current token being processed
str
the ring id
dict[list]
the updated rings dict
"""
multi_ring = False
ring_token = token
partial_str = ""
while True:
if multi_ring and token == '%':
rings[ring_token].append(node_count)
elif multi_ring and token.isdigit():
ring_token += token
elif token == '%':
ring_token += token
multi_ring = True
elif multi_ring:
rings[ring_token].append(node_count)
ring_token = ""
elif token.isdigit():
rings[token].append(node_count)

partial_str += token
token = smile_iter.peek()
if token and not token.isdigit() and not token == '%':
break

try:
token = next(smile_iter)
except StopIteration:
break

return smile_iter, token, partial_str, rings

def strip_bonding_descriptors(fragment_string):
"""
Expand All @@ -60,6 +119,9 @@ def strip_bonding_descriptors(fragment_string):
bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
smile_iter = PeekIter(fragment_string)
bonding_descrpt = defaultdict(list)
rings = defaultdict(list)
ez_isomer_atoms = {}
rs_isomers = {}
smile = ""
node_count = 0
prev_node = 0
Expand All @@ -84,12 +146,19 @@ def strip_bonding_descriptors(fragment_string):
else:
atom = token
while peek != ']':
atom += peek
# deal with rs chirality
if peek == '@':
chiral_token = peek
if smile_iter.peek() == '@':
chiral_token = '@' + next(smile_iter)
rs_isomers[node_count] = (chiral_token, [])
else:
atom += peek
peek = next(smile_iter)

smile = smile + atom + "]"
prev_node = node_count
node_count += 1

elif token == '(':
anchor = prev_node
smile += token
Expand All @@ -99,8 +168,22 @@ def strip_bonding_descriptors(fragment_string):
elif token in bond_to_order:
current_order = bond_to_order[token]
smile += token
elif token in '] H @ . - = # $ : / \\ + - %' or token.isdigit():
# for chirality assignment we need to collect rings
elif token == '%' or token.isdigit():
smile_iter, token, part_str, rings = collect_ring_number(smile_iter,
token,
prev_node,
rings)
smile += part_str
elif token in '] H . - = # $ : + -':
smile += token
# deal with ez isomers
elif token in '/ \\':
# we have a branch
if node_count == 0:
ez_isomer_atoms[node_count] = (node_count, 1, token)
else:
ez_isomer_atoms[node_count] = (node_count, prev_node, token)
else:
if smile_iter.peek() and token + smile_iter.peek() in ['Cl', 'Br', 'Si', 'Mg', 'Na']:
smile += (token + next(smile_iter))
Expand All @@ -110,7 +193,13 @@ def strip_bonding_descriptors(fragment_string):
prev_node = node_count
node_count += 1

return smile, bonding_descrpt
# we need to annotate rings to the chiral isomers
for node in rs_isomers:
for ring_idx, ring_nodes in rings.items():
if node in ring_nodes:
bonded_node = _find_bonded_ring_node(ring_nodes, node)
rs_isomers[node][1].append(bonded_node)
return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms

def fragment_iter(fragment_str, all_atom=True):
"""
Expand Down Expand Up @@ -140,14 +229,21 @@ def fragment_iter(fragment_str, all_atom=True):
delim = fragment.find('=', 0)
fragname = fragment[1:delim]
big_smile = fragment[delim+1:]
smile, bonding_descrpt = strip_bonding_descriptors(big_smile)
smile, bonding_descrpt, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile)
if smile == "H":
mol_graph = nx.Graph()
mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0])
nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
elif all_atom:
mol_graph = pysmiles.read_smiles(smile, reinterpret_aromatic=False, strict=False)
nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer')
# we need to split countable node keys and the associated value
ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()}
ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()}
nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms')
nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class')
print("hcount", mol_graph.nodes(data='hcount'))
# we deal with a CG resolution graph
else:
mol_graph = read_cgsmiles(smile)
Expand Down
13 changes: 12 additions & 1 deletion cgsmiles/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
sort_nodes_by_attr,
annotate_fragments,
set_atom_names_atomistic)
from .pysmiles_utils import rebuild_h_atoms
from .pysmiles_utils import (rebuild_h_atoms,
annotate_ez_isomers,
mark_chiral_atoms)

def compatible(left, right, legacy=False):
"""
Expand Down Expand Up @@ -357,6 +359,15 @@ def resolve(self):
# sort the atoms
self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid"))

if all_atom:
# assign chiral atoms
mark_chiral_atoms(self.molecule)
# assign rs isomerism
annotate_ez_isomers(self.molecule)
# in all-atom MD there are common naming conventions
# that might be expected and hence we set them here
set_atom_names_atomistic(self.molecule, self.meta_graph)

# and redo the meta molecule
self.meta_graph = annotate_fragments(self.meta_graph,
self.molecule)
Expand Down
Loading

0 comments on commit 4430b6b

Please sign in to comment.