gruenewald-lab · fgrunewald · May 6, 2024 · Apr 12, 2024 · Apr 12, 2024 · Apr 14, 2024
diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py
@@ -69,10 +69,9 @@ def sort_nodes_by_attr(graph, sort_attr="fragid"):
     nx.Graph
         graph with nodes sorted in correct order
     """
-    fragids = nx.get_node_attributes(graph, "fragid")
-    sorted_ids = sorted(fragids.items(), key=lambda item: (item[1], item[0]))
-    print(sorted_ids)
-    mapping = {old[0]: new for new, old in enumerate(sorted_ids)}
+    attr_values = nx.get_node_attributes(graph, sort_attr)
+    sorted_ids = sorted(attr_values, key=lambda item: (attr_values[item], item))
+    mapping = {old: new for new, old in enumerate(sorted_ids)}
     new_graph = nx.relabel_nodes(graph, mapping, copy=True)
     return new_graph
 

diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
@@ -7,7 +7,7 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
     """
     Helper function which add hydrogen atoms to the molecule graph.
 
-    First the hcount attribute produced by pysmiles us updated, because
+    First the hcount attribute produced by pysmiles is updated, because
     fragments have no bonds at time of reading so pysmiles does not
     know the connectivity. Hence the hcount is redone based on the
     actual connectivity of the final molecule.
@@ -37,7 +37,7 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
             bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\
                                mol_graph.neighbors(node)]))
             charge = mol_graph.nodes[node].get('charge', 0)
-            hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\
+            hcount = pysmiles.smiles_helper._valence(mol_graph, node, minimum=0) -\
                      bonds +\
                      charge
             # in this case we only rebuild hydrogen atoms that are not

diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
@@ -6,6 +6,35 @@
 import pysmiles
 from .read_cgsmiles import read_cgsmiles
 
+class PeekIter(object):
+    """Iterator over an indexable collection with one-item look-ahead."""
+
+    def __init__(self, collection):
+        self.collection = collection  # anything supporting integer indexing
+        self.index = 0  # position of the item the next ``__next__`` yields
+
+    def __iter__(self):
+        """Rewind to the first item and return the iterator itself."""
+        self.index = 0
+        return self
+
+    def __next__(self):
+        """Return the current item and advance; StopIteration at the end."""
+        try:
+            item = self.collection[self.index]
+        except IndexError:
+            raise StopIteration
+        self.index += 1
+        return item
+
+    def peek(self):
+        """Return the upcoming item without advancing, or '' at the end."""
+        try:
+            return self.collection[self.index]
+        except IndexError:
+            return ""
+
+
 def strip_bonding_descriptors(fragment_string):
     """
     Processes a CGBigSmile fragment string by
@@ -27,11 +56,13 @@ def strip_bonding_descriptors(fragment_string):
         a dict mapping bonding descriptors
         to the nodes within the string
     """
-    smile_iter = iter(fragment_string)
+    bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
+    smile_iter = PeekIter(fragment_string)
     bonding_descrpt = defaultdict(list)
     smile = ""
     node_count = 0
     prev_node = 0
+    current_order = None
     for token in smile_iter:
         if token == '[':
             peek = next(smile_iter)
@@ -41,7 +72,14 @@ def strip_bonding_descriptors(fragment_string):
                 while peek != ']':
                     bond_descrp += peek
                     peek = next(smile_iter)
-                bonding_descrpt[prev_node].append(bond_descrp)
+                if smile_iter.peek() in bond_to_order:
+                    order = bond_to_order[next(smile_iter)]
+                elif current_order:
+                    order = current_order
+                    current_order = None
+                else:
+                    order = 1
+                bonding_descrpt[prev_node].append(bond_descrp + str(order))
             else:
                 atom = token
                 while peek != ']':
@@ -59,8 +97,10 @@ def strip_bonding_descriptors(fragment_string):
         elif token == ')':
             prev_node = anchor
             smile += token
+        elif token in bond_to_order:
+            current_order = bond_to_order[token]
         else:
-            if token not in '] H @ . - = # $ : / \\ + - %'\
+            if token not in '] H @ $ / \\ + - %'\
                 and not token.isdigit():
                 prev_node = node_count
                 node_count += 1

diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py
@@ -21,7 +21,7 @@ def compatible(left, right):
     -------
     bool
     """
-    if left == right and left not in '> <':
+    if left == right and left[0] not in '> <':
         return True
     l, r = left[0], right[0]
     if (l, r) == ('<', '>') or (l, r) == ('>', '<'):
@@ -177,9 +177,7 @@ def edges_from_bonding_descrpt(self):
 
             # bonding descriptors are assumed to have bonding order 1
             # unless they are specifically annotated
-            order = re.findall("\d+\.\d+", bonding[0])
-            if not order:
-                order = 1
+            order = int(bonding[0][-1])
             self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order)
 
     def squash_atoms(self):
@@ -202,8 +200,7 @@ def squash_atoms(self):
                                                 self_loops=False)
 
             # add the fragment id of the sequashed node
-            self.molecule.nodes[node_to_keep]['fragid'] +=\
-            self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid']
+            self.molecule.nodes[node_to_keep]['fragid'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid']
 
     def resolve(self):
 

diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py
@@ -127,31 +127,35 @@ def test_read_cgsmiles(smile, nodes, edges):
                         # smiple symmetric bonding
                         ("[$]COC[$]",
                          "COC",
-                        {0: ["$"], 2: ["$"]}),
+                        {0: ["$1"], 2: ["$1"]}),
+                        # simple symmetric bonding with a multi-character name
+                        ("[$1A]COC[$1A]",
+                         "COC",
+                        {0: ["$1A1"], 2: ["$1A1"]}),
                         # simple symmetric but with explicit hydrogen
                         ("[$][CH2]O[CH2][$]",
                          "[CH2]O[CH2]",
-                        {0: ["$"], 2: ["$"]}),
+                        {0: ["$1"], 2: ["$1"]}),
                         # smiple symmetric bonding; multiple descript
                         ("[$]COC[$][$1]",
                          "COC",
-                        {0: ["$"], 2: ["$", "$1"]}),
+                        {0: ["$1"], 2: ["$1", "$11"]}),
                         # named different bonding descriptors
                         ("[$1]CCCC[$2]",
                          "CCCC",
-                        {0: ["$1"], 3: ["$2"]}),
+                        {0: ["$11"], 3: ["$21"]}),
                         # ring and bonding descriptors
                         ("[$1]CC[$2]C1CCCCC1",
                          "CCC1CCCCC1",
-                        {0: ["$1"], 1: ["$2"]}),
+                        {0: ["$11"], 1: ["$21"]}),
                         # bonding descript. after branch
                         ("C(COC[$1])[$2]CCC[$3]",
                          "C(COC)CCC",
-                        {0: ["$2"], 3: ["$1"], 6: ["$3"]}),
+                        {0: ["$21"], 3: ["$11"], 6: ["$31"]}),
                         # left rigth bonding desciptors
                         ("[>]COC[<]",
                         "COC",
-                        {0: [">"], 2: ["<"]})
+                        {0: [">1"], 2: ["<1"]})
 ))
 def test_strip_bonding_descriptors(big_smile, smile, bonding):
     new_smile, new_bonding = strip_bonding_descriptors(big_smile)
@@ -161,50 +165,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding):
 @pytest.mark.parametrize('fragment_str, nodes, edges',(
                         # single fragment
                         ("{#PEO=[$]COC[$]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
                                 )},
                         {"PEO": [(0, 1), (1, 2)]}),
                         # single fragment but with explicit hydrogen in smiles
                         ("{#PEO=[$][CH2]O[CH2][$]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}),
                                 )},
                         {"PEO": [(0, 1), (1, 2)]}),
                         # test NH3 terminal
                         ("{#AMM=N[$]}",
-                        {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$"], "element": "N"}),
+                        {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3}),
                                 )},
                         {"AMM": []}),
                         # single fragment + 1 terminal (i.e. only 1 bonding descrpt
                         ("{#PEO=[$]COC[$],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),)},
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}),)},
                         {"PEO": [(0, 1), (1, 2)],
                          "OHter": []}),
                         # single fragment + 1 terminal but multiple bond descritp.
                         # this adjust the hydrogen count
                         ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),)},
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}),)},
                         {"PEO": [(0, 1), (1, 2)],
                          "OHter": []}),
                         # single fragment + 1 terminal but multiple bond descritp.
                         # but explicit hydrogen in the smiles string
                         ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$"], "element": "C"}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O"}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$", "$1"], "element": "C"}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$"], "element": "O"}),
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}),
                                    )},
                         {"PEO": [(0, 1), (1, 2),],
                          "OHter": []}),

diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py
@@ -24,6 +24,11 @@
                          {0: ['>'], 1: ['$5']},
                          (3, 0),
                          ('<', '>')),
+                        # left right with annotated > <
+                        ({0: ['$'], 1: ['>1'], 3: ['<1']},
+                         {0: ['>1'], 1: ['$5']},
+                         (3, 0),
+                         ('<1', '>1')),
                         # left right selective bonding
                         # with identifier
                         ({0: ['$'], 1: ['>'], 3: ['<1']},
@@ -53,6 +58,16 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
                         [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
                          (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
                          (11, 14), (11, 15), (11, 16), (16, 17)]),
+                        # simple linear sequence with bond-order in link
+                        ("{[#TC1][#TC4][#TC1]}.{#TC1=[$1]=CC=[$2],#TC4=[$1]=CC=[$2]}",
+                        #         0 1 2 3 4 5            6 7 8 9
+                        [('TC1', 'C C H H H H'), ('TC4', 'C C H H'),
+                        #       10 11 12 13 14 15
+                         ('TC1', 'C C H H H H')],
+                        'C C H H H H C C H H C C H H H H',
+                        [(0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (0, 6), (6, 7),
+                         (6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13),
+                         (10, 14), (11, 15)]),
                         # smiple linear seqeunce unconsumed bonding descrpt
                         ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}",
                         #           0 1             2 3 4 5 6 7 8
@@ -73,6 +88,16 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
                         [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
                          (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
                          (11, 14), (11, 15), (11, 16), (16, 17)]),
+                        # simple linear sequence with ionic ending
+                        ("{[#OH][#PEO]|2[#ON]}.{#PEO=[$]COC[$],#OH=[$]O,#ON=[$][O-]}",
+                        #           0 1             2 3 4 5 6 7 8
+                        [('OH', 'O H'), ('PEO', 'C O C H H H H'),
+                        #        9 10 11 12 13 14 15         16 17
+                         ('PEO', 'C O C H H H H'), ('ON', 'O')],
+                        'O H C O C H H H H C O C H H H H O',
+                        [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
+                         (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
+                         (11, 14), (11, 15), (11, 16)]),
                         # uncomsumed bonding IDs; note that this is not the same
                         # molecule as previous test case. Here one of the OH branches
                         # and replaces an CH2 group with CH-OH
@@ -82,9 +107,9 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
                         #       9 10 11 12 13 14 15           16 17
                          ('PEO', 'C O C H H H H'), ('OHter', 'O H')],
                         'O H C O C H H H H C O C H H H H O H',
-                        [(0, 1), (0, 2), (2, 3), (2, 5), (2, 9), (3, 4),
+                        [(0, 1), (0, 2), (2, 3), (2, 5), (2, 11), (3, 4),
                          (4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13),
-                         (10, 11), (11, 15), (11, 14), (11, 16), (16, 17)]),
+                         (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)]),
                         # simple branched sequence
                         ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}",
                         [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'),
@@ -173,5 +198,9 @@ def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges):
     def _ele_match(n1, n2):
         return n1["element"] == n2["element"]
 
+    print(smile)
+    print(ref_graph.edges)
+    print(molecule.edges)
+    assert ref_graph.edges == molecule.edges
     # check that reference graph and molecule are isomorphic
     assert nx.is_isomorphic(ref_graph, molecule, node_match=_ele_match)