gruenewald-lab · fgrunewald · Oct 7, 2024 · Sep 12, 2024 · Sep 12, 2024 · Sep 23, 2024
diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
@@ -1,7 +1,11 @@
 import logging
 import networkx as nx
 import pysmiles
-from pysmiles.smiles_helper import _annotate_ez_isomers
+from pysmiles.smiles_helper import (_annotate_ez_isomers,
+                                    _mark_chiral_atoms,
+                                    remove_explicit_hydrogens,
+                                    add_explicit_hydrogens)
+
 LOGGER = logging.getLogger(__name__)
 
 def compute_mass(input_molecule):
@@ -103,3 +107,57 @@ def annotate_ez_isomers(molecule):
     for node in ez_isomer_atoms:
         del  molecule.nodes[node]['ez_isomer_atoms']
         del  molecule.nodes[node]['ez_isomer_class']
+
+def mark_chiral_atoms(molecule):
+    """
+    For all nodes tagged as chiral, figure out the three
+    substituents and annotate the node with a tuple that
+    has the order in which to rotate. This essentially
+    corresponds to the definition of an improper dihedral
+    angle centered on the chiral atom.
+
+    Pysmiles treats explicit hydrogen atoms differently
+    from implicit ones. In cgsmiles at the time when this
+    function is called all hydrogen atoms have been added
+    explicitly. However, we need to correct the chirality
+    assigment for this.
+
+    Note that this means it is not possible to have explicit
+    hydrogen atoms attached to chiral centers within cgsmiles
+    to begin with. However, this is a rather edge case.
+    """
+    chiral_nodes = nx.get_node_attributes(molecule, 'rs_isomer')
+    for node, (direction, rings) in chiral_nodes.items():
+        # first the ring atoms in order
+        # that they were connected then the
+        # other neighboring atoms in order
+        bonded_neighbours = sorted(molecule[node])
+        neighbours = list(rings)
+        hstash = None
+        for neighbour in bonded_neighbours:
+            if neighbour not in neighbours:
+                # all hydrogen atoms are explicit in cgsmiles
+                # BUT in the input of chirality they are usual
+                # implicit. At this point we need to treat all
+                # hydrogen atoms as if they were implicit.
+                if molecule.nodes[neighbour]['element'] != 'H':
+                    neighbours.append(neighbour)
+                else:
+                    hstash = neighbour
+        if hstash:
+            neighbours.insert(1, hstash)
+
+        if len(neighbours) != 4:
+            # FIXME Tetrahedral Allene-like Systems such as `NC(Br)=[C@]=C(O)C`
+            msg = (f"Chiral node {node} has {len(neighbours)} neighbors, which "
+                    "is different than the four expected for tetrahedral "
+                    "chirality.")
+            raise ValueError(msg)
+        # the default is anti-clockwise sorting indicated by '@'
+        # in this case the nodes are sorted with increasing
+        # node index; however @@ means clockwise and the
+        # order of nodes is reversed (i.e. with decreasing index)
+        if direction == '@@':
+            neighbours = [neighbours[0],  neighbours[1], neighbours[3], neighbours[2]]
+
+        molecule.nodes[node]['rs_isomer'] = tuple(neighbours)
diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
@@ -35,6 +35,45 @@ def peek(self):
     def __iter__(self):
         return self
 
+def _find_bonded_ring_node(ring_nodes, node):
+    current = ring_nodes.index(node)
+    if current%2 == 0:
+        other = ring_nodes[current+1]
+    else:
+        other = ring_nodes[current-1]
+    return other
+
+def deal_with_chiral_rings(smile_iter, token, node_count, rings):
+    """
+    """
+    multi_ring = False
+    ring_token = token
+    partial_str = ""
+    while True:
+        if multi_ring and token == '%':
+            rings[ring_token].append(node_count)
+        elif multi_ring and token.isdigit():
+            ring_token += token
+        elif token == '%':
+            ring_token += token
+            multi_ring = True
+        elif multi_ring:
+            rings[ring_token].append(node_count)
+            ring_token = ""
+        elif token.isdigit():
+            rings[token].append(node_count)
+
+        partial_str += token
+        token = smile_iter.peek()
+        if token and not token.isdigit() and not token == '%':
+            break
+
+        try:
+            token = next(smile_iter)
+        except StopIteration:
+            break
+
+    return smile_iter, token, partial_str, rings
 
 def strip_bonding_descriptors(fragment_string):
     """
@@ -60,6 +99,7 @@ def strip_bonding_descriptors(fragment_string):
     bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
     smile_iter = PeekIter(fragment_string)
     bonding_descrpt = defaultdict(list)
+    rings = defaultdict(list)
     ez_isomer_atoms = {}
     rs_isomers = {}
     smile = ""
@@ -95,10 +135,10 @@ def strip_bonding_descriptors(fragment_string):
                     else:
                         atom += peek
                     peek = next(smile_iter)
+
                 smile = smile + atom + "]"
                 prev_node = node_count
                 node_count += 1
-
         elif token == '(':
             anchor = prev_node
             smile += token
@@ -108,7 +148,14 @@ def strip_bonding_descriptors(fragment_string):
         elif token in bond_to_order:
             current_order = bond_to_order[token]
             smile += token
-        elif token in '] H . - = # $ : + - %' or token.isdigit():
+        # for chirality assignment we need to collect rings
+        elif token == '%' or token.isdigit():
+            smile_iter, token, part_str, rings = deal_with_chiral_rings(smile_iter,
+                                                                        token,
+                                                                        prev_node,
+                                                                        rings)
+            smile += part_str
+        elif token in '] H . - = # $ : + -':
             smile += token
         # deal with ez isomers
         elif token in '/ \\':
@@ -126,6 +173,12 @@ def strip_bonding_descriptors(fragment_string):
             prev_node = node_count
             node_count += 1
 
+    # we need to annotate rings to the chiral isomers
+    for node in rs_isomers:
+        for ring_idx, ring_nodes in rings.items():
+            if node in ring_nodes:
+                bonded_node = _find_bonded_ring_node(ring_nodes, node)
+                rs_isomers[node][1].append(bonded_node)
     return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms
 
 def fragment_iter(fragment_str, all_atom=True):
@@ -170,6 +223,7 @@ def fragment_iter(fragment_str, all_atom=True):
             ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()}
             nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms')
             nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class')
+            print("hcount", mol_graph.nodes(data='hcount'))
         # we deal with a CG resolution graph
         else:
             mol_graph = read_cgsmiles(smile)

diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py
@@ -1,14 +1,15 @@
 import re
 import copy
 import networkx as nx
-from pysmiles.smiles_helper import _mark_chiral_atoms
 from .read_cgsmiles import read_cgsmiles
 from .read_fragments import read_fragments
 from .graph_utils import (merge_graphs,
                           sort_nodes_by_attr,
                           annotate_fragments,
                           set_atom_names_atomistic)
-from .pysmiles_utils import rebuild_h_atoms, annotate_ez_isomers
+from .pysmiles_utils import (rebuild_h_atoms,
+                             annotate_ez_isomers,
+                             mark_chiral_atoms)
 
 def compatible(left, right, legacy=False):
     """
@@ -359,8 +360,8 @@ def resolve(self):
         self.molecule = sort_nodes_by_attr(self.molecule, sort_attr=("fragid"))
 
         if all_atom:
-            # assign chirality
-            _mark_chiral_atoms(self.molecule)
+            # assign chiral atoms
+            mark_chiral_atoms(self.molecule)
             # assign rs isomerism
             annotate_ez_isomers(self.molecule)
             # in all-atom MD there are common naming conventions

diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py
@@ -185,7 +185,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         [(0, 1), (0, 2), (0, 3), (0, 4), (1, 5),
                          (1, 7), (5, 9), (5, 6), (7, 13), (7, 8),
                          (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}),
-                        # smiple chirality assigment between fragments
+                        # simple chirality assigment between fragments
                         ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@H][>]C(=O)OC}",
                         # 0 1 2 3
                         [('A', 'O H'), ('B', 'C C C O O C H H H H H H'),
@@ -194,8 +194,18 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         [(0, 1), (0, 2), (2, 3), (2, 4),
                          (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12),
                          (10, 13), (5, 14), (14, 15)],
-                        {3: (2, 4, 10, 14)}, {}),
-                        # smiple chirality assigment between fragments inv
+                        {3: (2, 10, 4, 14)}, {}),
+                        # simple chirality assigment with rings
+                        ("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}",
+                        # 0 1 2 3
+                        [('GLC', 'C C C C C C O O O O O O H H H H H H H H H H H H')],
+                        'C C C C C C O O O O O O H H H H H H H H H H H H',
+                        [(0, 1), (0, 11), (0, 12), (0, 13), (1, 2), (1, 6), (1, 14), (2, 3), (2, 10),
+                         (2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18),
+                         (7, 19), (8, 20), (9, 21), (10, 22), (11, 23)],
+                        {1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4),
+                         4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}),
+                        # simple chirality assigment between fragments inv
                         ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@@H][>]C(=O)OC}",
                         # 0 1 2 3
                         [('A', 'O H'), ('B', 'C C C O O C H H H H H H'),
@@ -204,15 +214,15 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         [(0, 1), (0, 2), (2, 3), (2, 4),
                          (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12),
                          (10, 13), (5, 14), (14, 15)],
-                        {3: (2, 4, 14, 10)}, {}),
+                        {3: (2, 10, 14, 4)}, {}),
                         # smiple ez isomerism assigment between fragments inv
                         ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(\F)C}",
                         [('A', 'C C F H H H'), ('B', 'C F C H H H')],
                         'C C F H H H F C C H H H',
                         [(0, 1), (1, 2), (0, 3), (0, 4),
                          (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)],
                         {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}),
-                        # smiple ez isomerism assigment between fragments inv
+                        # simple ez isomerism assigment between fragments inv
                         ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(/F)C}",
                         [('A', 'C C F H H H'), ('B', 'C F C H H H')],
                         'C C F H H H F C C H H H',