Merge pull request #20 from gruenewald-lab/isomerism

implement isomerism bug fixes plus tests
gruenewald-lab · Oct 7, 2024 · 4430b6b · 4430b6b
2 parents 2aaf60b + 3d58e50
commit 4430b6b
Show file tree

Hide file tree

Showing 6 changed files with 371 additions and 35 deletions.
diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py
@@ -44,6 +44,11 @@ def merge_graphs(source_graph, target_graph, max_node=None):
         correspondence[node] = idx
         new_atom = copy.deepcopy(target_graph.nodes[node])
         new_atom['fragid'] = [(new_atom.get('fragid', 0) + fragment_offset)]
+        # make sure to propagate the ez isomers
+        if 'ez_isomer_atoms' in new_atom:
+            new_atom['ez_isomer_atoms'] = (new_atom['ez_isomer_atoms'][0]+offset+1,
+                                           new_atom['ez_isomer_atoms'][1]+offset+1)
+        # make sure to propagate the ez atoms
         source_graph.add_node(idx, **new_atom)
 
     for node1, node2 in target_graph.edges:
@@ -53,7 +58,9 @@ def merge_graphs(source_graph, target_graph, max_node=None):
 
     return correspondence
 
-def sort_nodes_by_attr(graph, sort_attr="fragid"):
+def sort_nodes_by_attr(graph,
+                       sort_attr="fragid",
+                       relative_attr=[('ez_isomer_atoms', True)]):
     """
     Sort nodes in graph by attribute and relable the graph in place.
 
@@ -63,6 +70,13 @@ def sort_nodes_by_attr(graph, sort_attr="fragid"):
         the graph to sort nodes of
     sort_attr: `abc.hashable`
         the attribute to use for sorting
+    relative_attr: tuple(str, bool)
+        a list of attributes that are sensetive
+        to the ordering of nodes (i.e. refer to
+        other nodes). The second element indicates
+        the depth. If False the value of attr are
+        node keys. If True we expect the value to
+        be an itertable with node keys.
 
     Returns
     -------
@@ -73,6 +87,23 @@ def sort_nodes_by_attr(graph, sort_attr="fragid"):
     sorted_ids = sorted(attr_values, key=lambda item: (attr_values[item], item))
     mapping = {old: new for new, old in enumerate(sorted_ids)}
     new_graph = nx.relabel_nodes(graph, mapping, copy=True)
+    for attr, is_list in relative_attr:
+        attr_dict = nx.get_node_attributes(new_graph, attr)
+        new_dict = {}
+        for key, values in attr_dict.items():
+            try:
+                iter(values)
+            except TypeError:
+                is_list = False
+            else:
+                is_list = not isinstance(values, str)
+            if is_list:
+                new_values = [mapping[value] for value in values]
+            else:
+                new_values = mapping[values]
+            new_dict[key] = new_values
+        if new_dict:
+            nx.set_node_attributes(new_graph, new_dict, attr)
     return new_graph
 
 def _keyfunc(graph, node_idx, attrs):

diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
@@ -1,5 +1,12 @@
+import logging
 import networkx as nx
 import pysmiles
+from pysmiles.smiles_helper import (_annotate_ez_isomers,
+                                    _mark_chiral_atoms,
+                                    remove_explicit_hydrogens,
+                                    add_explicit_hydrogens)
+
+LOGGER = logging.getLogger(__name__)
 
 def compute_mass(input_molecule):
     """
@@ -76,3 +83,85 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
             ref_node = next(mol_graph.neighbors(node))
             mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
             mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]
+
+def annotate_ez_isomers(molecule):
+    """
+    Small wrapper dealing with ez_isomer annotation.
+
+    Parameters
+    ----------
+    molecule: nx.Graph
+        The molecule of interest, which must of ez_isomer_pairs
+        and ez_isomer_class set as node attributes
+    """
+    ez_isomer_atoms = nx.get_node_attributes(molecule, 'ez_isomer_atoms')
+    ez_isomer_class = nx.get_node_attributes(molecule, 'ez_isomer_class')
+    ez_isomer_atoms_list = [atoms + [_class] for atoms, _class in zip(ez_isomer_atoms.values(), ez_isomer_class.values())]
+    ez_isomer_pairs = list(zip(ez_isomer_atoms_list[:-1], ez_isomer_atoms_list[1:]))
+    if len(ez_isomer_atoms)%2 != 0:
+        msg = ("You have an uneven amount of atoms marked as CIS/TRANS isomers."
+               "We will drop the last atom from assigning the iosmers.")
+        LOGGER.warning(msg)
+    _annotate_ez_isomers(molecule, ez_isomer_pairs)
+    # clean up
+    for node in ez_isomer_atoms:
+        del  molecule.nodes[node]['ez_isomer_atoms']
+        del  molecule.nodes[node]['ez_isomer_class']
+
+def mark_chiral_atoms(molecule):
+    """
+    For all nodes tagged as chiral, figure out the three
+    substituents and annotate the node with a tuple that
+    has the order in which to rotate. This essentially
+    corresponds to the definition of an improper dihedral
+    angle centered on the chiral atom.
+
+    Pysmiles treats explicit hydrogen atoms differently
+    from implicit ones. In cgsmiles at the time when this
+    function is called all hydrogen atoms have been added
+    explicitly. However, we need to correct the chirality
+    assigment for this.
+
+    Note that this means it is not possible to have explicit
+    hydrogen atoms attached to chiral centers within cgsmiles
+    to begin with. However, this is a rather edge case.
+    """
+    chiral_nodes = nx.get_node_attributes(molecule, 'rs_isomer')
+    for node, (direction, rings) in chiral_nodes.items():
+        # first the ring atoms in order
+        # that they were connected then the
+        # other neighboring atoms in order
+        bonded_neighbours = sorted(molecule[node])
+        neighbours = list(rings)
+        hstash = []
+        for neighbour in bonded_neighbours:
+            if neighbour not in neighbours:
+                # all hydrogen atoms are explicit in cgsmiles
+                # BUT in the input of chirality they are usual
+                # implicit. At this point we need to treat all
+                # hydrogen atoms as if they were implicit.
+                if molecule.nodes[neighbour]['element'] != 'H':
+                    neighbours.append(neighbour)
+                else:
+                    hstash.append(neighbour)
+        if hstash and len(hstash) == 1:
+            neighbours.insert(1, hstash[0])
+        elif len(hstash) > 1:
+            msg = (f"Chiral node {node} as more than 1 hydrogen neighbor."
+                    "Therefore it is not chiral.")
+            raise ValueError(msg)
+
+        if len(neighbours) != 4:
+            # FIXME Tetrahedral Allene-like Systems such as `NC(Br)=[C@]=C(O)C`
+            msg = (f"Chiral node {node} has {len(neighbours)} neighbors, which "
+                    "is different than the four expected for tetrahedral "
+                    "chirality.")
+            raise ValueError(msg)
+        # the default is anti-clockwise sorting indicated by '@'
+        # in this case the nodes are sorted with increasing
+        # node index; however @@ means clockwise and the
+        # order of nodes is reversed (i.e. with decreasing index)
+        if direction == '@@':
+            neighbours = [neighbours[0],  neighbours[1], neighbours[3], neighbours[2]]
+
+        molecule.nodes[node]['rs_isomer'] = tuple(neighbours)
diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
@@ -35,6 +35,65 @@ def peek(self):
     def __iter__(self):
         return self
 
+def _find_bonded_ring_node(ring_nodes, node):
+    current = ring_nodes.index(node)
+    if current%2 == 0:
+        other = ring_nodes[current+1]
+    else:
+        other = ring_nodes[current-1]
+    return other
+
+def collect_ring_number(smile_iter, token, node_count, rings):
+    """
+    When a ring identifier is found, this function will add
+    the current node to the rings dict.
+
+    Parameters
+    ----------
+    smile_iter: :class:PeekIter
+    token: str
+    node_count: int
+    rings: dict[list]
+
+    Returns
+    -------
+    PeekIter
+        the advanced smiles_iter
+    str
+        the current token being processed
+    str
+        the ring id
+    dict[list]
+        the updated rings dict
+    """
+    multi_ring = False
+    ring_token = token
+    partial_str = ""
+    while True:
+        if multi_ring and token == '%':
+            rings[ring_token].append(node_count)
+        elif multi_ring and token.isdigit():
+            ring_token += token
+        elif token == '%':
+            ring_token += token
+            multi_ring = True
+        elif multi_ring:
+            rings[ring_token].append(node_count)
+            ring_token = ""
+        elif token.isdigit():
+            rings[token].append(node_count)
+
+        partial_str += token
+        token = smile_iter.peek()
+        if token and not token.isdigit() and not token == '%':
+            break
+
+        try:
+            token = next(smile_iter)
+        except StopIteration:
+            break
+
+    return smile_iter, token, partial_str, rings
 
 def strip_bonding_descriptors(fragment_string):
     """
@@ -60,6 +119,9 @@ def strip_bonding_descriptors(fragment_string):
     bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
     smile_iter = PeekIter(fragment_string)
     bonding_descrpt = defaultdict(list)
+    rings = defaultdict(list)
+    ez_isomer_atoms = {}
+    rs_isomers = {}
     smile = ""
     node_count = 0
     prev_node = 0
@@ -84,12 +146,19 @@ def strip_bonding_descriptors(fragment_string):
             else:
                 atom = token
                 while peek != ']':
-                    atom += peek
+                    # deal with rs chirality
+                    if peek == '@':
+                        chiral_token = peek
+                        if smile_iter.peek() == '@':
+                            chiral_token = '@' + next(smile_iter)
+                        rs_isomers[node_count] = (chiral_token, [])
+                    else:
+                        atom += peek
                     peek = next(smile_iter)
+
                 smile = smile + atom + "]"
                 prev_node = node_count
                 node_count += 1
-
         elif token == '(':
             anchor = prev_node
             smile += token
@@ -99,8 +168,22 @@ def strip_bonding_descriptors(fragment_string):
         elif token in bond_to_order:
             current_order = bond_to_order[token]
             smile += token
-        elif token in '] H @ . - = # $ : / \\ + - %' or token.isdigit():
+        # for chirality assignment we need to collect rings
+        elif token == '%' or token.isdigit():
+            smile_iter, token, part_str, rings = collect_ring_number(smile_iter,
+                                                                     token,
+                                                                     prev_node,
+                                                                     rings)
+            smile += part_str
+        elif token in '] H . - = # $ : + -':
             smile += token
+        # deal with ez isomers
+        elif token in '/ \\':
+            # we have a branch
+            if node_count == 0:
+                ez_isomer_atoms[node_count] = (node_count, 1, token)
+            else:
+                ez_isomer_atoms[node_count] = (node_count, prev_node, token)
         else:
             if smile_iter.peek() and token + smile_iter.peek() in ['Cl', 'Br', 'Si', 'Mg', 'Na']:
                 smile += (token + next(smile_iter))
@@ -110,7 +193,13 @@ def strip_bonding_descriptors(fragment_string):
             prev_node = node_count
             node_count += 1
 
-    return smile, bonding_descrpt
+    # we need to annotate rings to the chiral isomers
+    for node in rs_isomers:
+        for ring_idx, ring_nodes in rings.items():
+            if node in ring_nodes:
+                bonded_node = _find_bonded_ring_node(ring_nodes, node)
+                rs_isomers[node][1].append(bonded_node)
+    return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms
 
 def fragment_iter(fragment_str, all_atom=True):
     """
@@ -140,14 +229,21 @@ def fragment_iter(fragment_str, all_atom=True):
         delim = fragment.find('=', 0)
         fragname = fragment[1:delim]
         big_smile = fragment[delim+1:]
-        smile, bonding_descrpt = strip_bonding_descriptors(big_smile)
+        smile, bonding_descrpt, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile)
         if smile == "H":
             mol_graph = nx.Graph()
             mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0])
             nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
         elif all_atom:
             mol_graph = pysmiles.read_smiles(smile, reinterpret_aromatic=False, strict=False)
             nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
+            nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer')
+            # we need to split countable node keys and the associated value
+            ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()}
+            ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()}
+            nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms')
+            nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class')
+            print("hcount", mol_graph.nodes(data='hcount'))
         # we deal with a CG resolution graph
         else:
             mol_graph = read_cgsmiles(smile)

diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py
@@ -7,7 +7,9 @@
                           sort_nodes_by_attr,
                           annotate_fragments,
                           set_atom_names_atomistic)
-from .pysmiles_utils import rebuild_h_atoms
+from .pysmiles_utils import (rebuild_h_atoms,
+                             annotate_ez_isomers,
+                             mark_chiral_atoms)
 
 def compatible(left, right, legacy=False):
     """
@@ -357,6 +359,15 @@ def resolve(self):
         # sort the atoms
         self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid"))
 
+        if all_atom:
+            # assign chiral atoms
+            mark_chiral_atoms(self.molecule)
+            # assign rs isomerism
+            annotate_ez_isomers(self.molecule)
+            # in all-atom MD there are common naming conventions
+            # that might be expected and hence we set them here
+            set_atom_names_atomistic(self.molecule, self.meta_graph)
+
         # and redo the meta molecule
         self.meta_graph = annotate_fragments(self.meta_graph,
                                              self.molecule)