Merge pull request #12 from gruenewald-lab/rings_new

Rings new
gruenewald-lab · Jul 8, 2024 · ca10d4f · ca10d4f
2 parents 56752fe + 70fec21
commit ca10d4f
Show file tree

Hide file tree

Showing 8 changed files with 195 additions and 105 deletions.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -16,7 +16,7 @@ jobs:
 
     strategy:
       matrix:
-       py_version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+       py_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 
     steps:
     - uses: actions/checkout@v2
@@ -28,6 +28,7 @@ jobs:
       run: |
         pip install --upgrade setuptools pip
         pip install --upgrade .
+        pip install git+https://github.com/pckroon/pysmiles.git
         pip install -r requirements-tests.txt
         
     - name: Run pytest with codecoverage

diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
@@ -1,8 +1,6 @@
+import networkx as nx
 import pysmiles
 
-VALENCES = pysmiles.smiles_helper.VALENCES
-VALENCES.update({"H": (1,)})
-
 def rebuild_h_atoms(mol_graph, keep_bonding=False):
     """
     Helper function which add hydrogen atoms to the molecule graph.
@@ -29,27 +27,25 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
         graph describing the full molecule without hydrogen atoms
     """
     for node in mol_graph.nodes:
-        if mol_graph.nodes[node].get('bonding', False):
-            # get the degree
-            ele = mol_graph.nodes[node]['element']
-            # hcount is the valance minus the degree minus
-            # the number of bonding descriptors
-            bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\
-                               mol_graph.neighbors(node)]))
-            charge = mol_graph.nodes[node].get('charge', 0)
-            hcount = pysmiles.smiles_helper._valence(mol_graph, node, minimum=0) -\
-                     bonds +\
-                     charge
-            # in this case we only rebuild hydrogen atoms that are not
-            # replaced by bonding operators.
-            if keep_bonding:
-                hcount -= len(mol_graph.nodes[node]['bonding'])
-
-            mol_graph.nodes[node]['hcount'] = hcount
-            if ele == "H":
-                mol_graph.nodes[node]['single_h_frag'] = True
+        if mol_graph.nodes[node].get('aromatic', False):
+            mol_graph.nodes[node]['hcount'] = 0
+
+        if mol_graph.nodes[node].get('bonding', False) and  \
+        mol_graph.nodes[node].get('element', '*') == "H":
+            mol_graph.nodes[node]['single_h_frag'] = True
+
+    for edge in mol_graph.edges:
+        if mol_graph.edges[edge]['order'] == 1.5:
+            mol_graph.edges[edge]['order'] = 1
 
+    pysmiles.smiles_helper.mark_aromatic_atoms(mol_graph, strict=False)
+    pysmiles.smiles_helper.mark_aromatic_edges(mol_graph)
+
+    nx.set_node_attributes(mol_graph, 0, 'hcount')
+
+    pysmiles.smiles_helper.fill_valence(mol_graph, respect_hcount=False)
     pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)
+
     for node in mol_graph.nodes:
         if mol_graph.nodes[node].get("element", "*") == "H" and\
         not mol_graph.nodes[node].get("single_h_frag", False):

diff --git a/cgsmiles/read_cgsmiles.py b/cgsmiles/read_cgsmiles.py
@@ -3,9 +3,9 @@
 import numpy as np
 import networkx as nx
 
-PATTERNS = {"bond_anchor": "\[\$.*?\]",
-            "place_holder": "\[\#.*?\]",
-            "annotation": "\|.*?\|",
+PATTERNS = {"bond_anchor": r"\[\$.*?\]",
+            "place_holder": r"\[\#.*?\]",
+            "annotation": r"\|.*?\|",
             "fragment": r'#(\w+)=((?:\[.*?\]|[^,\[\]]+)*)',
             "seq_pattern": r'\{([^}]*)\}(?:\.\{([^}]*)\})?'}
 
@@ -45,14 +45,18 @@ def _expand_branch(mol_graph, current, anchor, recipe):
             anchor = current
         for _ in range(0, n_mon):
             mol_graph.add_node(current, fragname=fragname)
-            mol_graph.add_edge(prev_node, current)
+            mol_graph.add_edge(prev_node, current, order=1)
 
             prev_node = current
             current += 1
 
     prev_node = anchor
     return mol_graph, current, prev_node
 
+def _get_percent(pattern, stop):
+    end_num = _find_next_character(pattern, ['[', ')', '(', '}'], stop)
+    return pattern[stop+1:end_num]
+
 def read_cgsmiles(pattern):
     """
     Generate a :class:`nx.Graph` from a pattern string according to the
@@ -126,7 +130,7 @@ def read_cgsmiles(pattern):
     branching = False
     # do we have an open cycle
     cycle = {}
-    cycle_edge = None
+    cycle_edges = []
     # each element in the for loop matches a pattern
     # '[' + '#' + some alphanumeric name + ']'
     for match in re.finditer(PATTERNS['place_holder'], pattern):
@@ -142,12 +146,28 @@ def read_cgsmiles(pattern):
 
         # here we check if the atom is followed by a cycle marker
         # in this case we have an open cycle and close it
-        if stop < len(pattern) and pattern[stop].isdigit() and pattern[stop] in cycle:
-            cycle_edge = (current, cycle[pattern[stop]])
-        # we open a cycle
-        elif stop < len(pattern) and pattern[stop].isdigit():
-            cycle_edge = None
-            cycle[pattern[stop]] = current
+        for token in pattern[stop:]:
+            # we close a cycle
+            if token.isdigit() and token in cycle:
+                cycle_edges.append((current, cycle[token]))
+                del cycle[token]
+            # we open a cycle
+            elif token.isdigit():
+                cycle[token] = current
+            # we found a ring indicator
+            elif token == "%":
+                ring_marker = _get_percent(pattern, stop)
+                # we close the ring
+                if ring_marker in cycle:
+                    cycle_edges.append((current, cycle[ring_marker]))
+                    del cycle[ring_marker]
+                    break
+                # we open a new ring
+                cycle[_get_percent(pattern, stop)] = current
+                break
+            else:
+                break
+
         # here we check if the atom is followed by a expansion character '|'
         # as in ... [#PEO]|
         if stop < len(pattern) and pattern[stop] == '|':
@@ -177,15 +197,21 @@ def read_cgsmiles(pattern):
             mol_graph.add_node(current, fragname=fragname)
 
             if prev_node is not None:
-                mol_graph.add_edge(prev_node, current)
+                mol_graph.add_edge(prev_node, current, order=1)
 
-            if cycle_edge:
-                mol_graph.add_edge(cycle_edge[0],
-                                   cycle_edge[1])
+            # here we have a double edge
+            for cycle_edge in cycle_edges:
+                if cycle_edge in mol_graph.edges:
+                    mol_graph.edges[cycle_edge]["order"] += 1
+                else:
+                    mol_graph.add_edge(cycle_edge[0],
+                                       cycle_edge[1],
+                                       order=1)
 
             prev_node = current
             current += 1
 
+        cycle_edges = []
         # here we check if the residue considered before is the
         # last residue of a branch (i.e. '...[#residue])'
         # that is the case if the branch closure comes before
@@ -254,4 +280,10 @@ def read_cgsmiles(pattern):
             # when all nested branches are completed
             if len(branch_anchor) == 0:
                 recipes = defaultdict(list)
+
+    # raise some errors for strange stuff
+    if cycle:
+        msg = "You have a dangling ring index."
+        raise SyntaxError(msg)
+
     return mol_graph
diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
@@ -38,21 +38,21 @@ def __iter__(self):
 
 def strip_bonding_descriptors(fragment_string):
     """
-    Processes a CGBigSmile fragment string by
+    Processes a CGSmiles fragment string by
     stripping the bonding descriptors and storing
     them in a dict with reference to the atom they
-    refer to. Furthermore, a cleaned SMILE or CGsmile
+    refer to. Furthermore, a cleaned SMILES or CGSmiles
     string is returned.
 
     Parameters
     ----------
     fragment_string: str
-        a CGBigsmile fragment string
+        a CGSmiles fragment string
 
     Returns
     -------
     str:
-        a canonical SMILES or CGsmiles string
+        a canonical SMILES or CGSmiles string
     dict:
         a dict mapping bonding descriptors
         to the nodes within the string
@@ -73,7 +73,7 @@ def strip_bonding_descriptors(fragment_string):
                 while peek != ']':
                     bond_descrp += peek
                     peek = next(smile_iter)
-                if smile_iter.peek() in bond_to_order:
+                if smile_iter.peek() in bond_to_order and node_count == 0:
                     order = bond_to_order[next(smile_iter)]
                 elif current_order:
                     order = current_order
@@ -87,8 +87,6 @@ def strip_bonding_descriptors(fragment_string):
                     atom += peek
                     peek = next(smile_iter)
                 smile = smile + atom + "]"
-                #if peek not in '] H @ . - = # $ : / \\ + - %'\
-                #and not token.isdigit():
                 prev_node = node_count
                 node_count += 1
 
@@ -100,12 +98,18 @@ def strip_bonding_descriptors(fragment_string):
             smile += token
         elif token in bond_to_order:
             current_order = bond_to_order[token]
-        else:
-            if token not in '] H @ $ / \\ + - %'\
-                and not token.isdigit():
-                prev_node = node_count
-                node_count += 1
             smile += token
+        elif token in '] H @ . - = # $ : / \\ + - %' or token.isdigit():
+            smile += token
+        else:
+            if smile_iter.peek() and token + smile_iter.peek() in ['Cl', 'Br', 'Si', 'Mg', 'Na']:
+                smile += (token + next(smile_iter))
+            else:
+                smile += token
+            current_order = None
+            prev_node = node_count
+            node_count += 1
+
     return smile, bonding_descrpt
 
 def fragment_iter(fragment_str, all_atom=True):
@@ -137,13 +141,12 @@ def fragment_iter(fragment_str, all_atom=True):
         fragname = fragment[1:delim]
         big_smile = fragment[delim+1:]
         smile, bonding_descrpt = strip_bonding_descriptors(big_smile)
-
         if smile == "H":
             mol_graph = nx.Graph()
             mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0])
             nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
         elif all_atom:
-            mol_graph = pysmiles.read_smiles(smile)
+            mol_graph = pysmiles.read_smiles(smile, reinterpret_aromatic=False)
             nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
         # we deal with a CG resolution graph
         else:

diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py
@@ -28,7 +28,7 @@ def compatible(left, right):
         return left[1:] == right[1:]
     return False
 
-def generate_edge(source, target, bond_attribute="bonding"):
+def match_bonding_descriptors(source, target, bond_attribute="bonding"):
     """
     Given a source and a target graph, which have bonding
     descriptors stored as node attributes, find a pair of
@@ -165,20 +165,27 @@ def edges_from_bonding_descrpt(self):
         bonding descriptors that formed the edge. Later unconsumed
         bonding descriptors are replaced by hydrogen atoms.
         """
-        for prev_node, node in nx.dfs_edges(self.meta_graph):
-            prev_graph = self.meta_graph.nodes[prev_node]['graph']
-            node_graph = self.meta_graph.nodes[node]['graph']
-            edge, bonding = generate_edge(prev_graph,
-                                          node_graph)
-
-            # remove used bonding descriptors
-            prev_graph.nodes[edge[0]]['bonding'].remove(bonding[0])
-            node_graph.nodes[edge[1]]['bonding'].remove(bonding[1])
-
-            # bonding descriptors are assumed to have bonding order 1
-            # unless they are specifically annotated
-            order = int(bonding[0][-1])
-            self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order)
+        for prev_node, node in self.meta_graph.edges:
+            for _ in range(0, self.meta_graph.edges[(prev_node, node)]["order"]):
+                prev_graph = self.meta_graph.nodes[prev_node]['graph']
+                node_graph = self.meta_graph.nodes[node]['graph']
+                try:
+                    edge, bonding = match_bonding_descriptors(prev_graph,
+                                                              node_graph)
+                except LookupError:
+                    continue
+                # remove used bonding descriptors
+                prev_graph.nodes[edge[0]]['bonding'].remove(bonding[0])
+                node_graph.nodes[edge[1]]['bonding'].remove(bonding[1])
+
+                # bonding descriptors are assumed to have bonding order 1
+                # unless they are specifically annotated
+                order = int(bonding[0][-1])
+                self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order)
+                if self.all_atom:
+                    for edge_node in edge:
+                        if self.molecule.nodes[edge_node]['element'] != 'H':
+                            self.molecule.nodes[edge_node]['hcount'] -= 1
 
     def squash_atoms(self):
         """