gruenewald-lab · fgrunewald · Jul 9, 2024 · Jun 24, 2024 · Jun 24, 2024 · Jul 8, 2024
diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py
@@ -114,3 +114,17 @@ def annotate_fragments(meta_graph, molecule):
         meta_graph.nodes[meta_node]['graph'] = graph_frag
 
     return meta_graph
+
+
+def set_atom_names_atomistic(meta_graph, molecule):
+    """
+    Set atomnames according to commonly used convention
+    in molecular dynamics (MD) forcefields. This convention
+    is defined as element plus counter for atom in residue.
+    """
+    for meta_node in meta_graph.nodes:
+        fraggraph = meta_graph.nodes[meta_node]['graph']
+        for idx, node in enumerate(fraggraph.nodes):
+            atomname = fraggraph.nodes[node]['element'] + str(idx)
+            fraggraph.nodes[node]['atomname'] = atomname
+            molecule.nodes[node]['atomname'] = atomname
diff --git a/cgsmiles/read_cgsmiles.py b/cgsmiles/read_cgsmiles.py
@@ -13,7 +13,7 @@ def _find_next_character(string, chars, start):
     for idx, token in enumerate(string[start:]):
         if token in chars:
             return idx+start
-    return np.inf
+    return len(string)
 
 def _expand_branch(mol_graph, current, anchor, recipe):
     """

diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py
@@ -1,10 +1,12 @@
 import re
 import copy
 import networkx as nx
-import pysmiles
 from .read_cgsmiles import read_cgsmiles
 from .read_fragments import read_fragments
-from .graph_utils import merge_graphs, sort_nodes_by_attr, annotate_fragments
+from .graph_utils import (merge_graphs,
+                          sort_nodes_by_attr,
+                          annotate_fragments,
+                          set_atom_names_atomistic)
 from .pysmiles_utils import rebuild_h_atoms
 
 def compatible(left, right):
@@ -69,62 +71,89 @@ def match_bonding_descriptors(source, target, bond_attribute="bonding"):
 
 class MoleculeResolver:
     """
-    Resolve the molecule described by a CGBigSmile
-    string.
+    Resolve the molecule(s) described by a
+    CGSmiles string. Each execution of the
+    resolve function will return the next
+    resolved molecule and the graph of the
+    previous resolution.
+
+    Use the resolve_iter method to loop over
+    all possible molecule resolutions enconded
+    in the CGSmiles string.
     """
-
     def __init__(self,
                  pattern,
                  meta_graph=None,
-                 fragment_dict={},
-                 all_atom=True):
-
-        # let's start by setting some attributes
-        # this is the fragment string
-        self.fragment_str = None
-        # this is the cgsmiles string
-        self.cgsmiles_str = None
+                 fragment_dicts=[],
+                 last_all_atom=True):
+
+        """
+        Parameters
+        ----------
+        pattern: str
+            the cgsmiles string to resolve
+        meta_graph: `:class:nx.Graph`
+            a potential higher resolution graph
+        fragment_dicts: list[dict[str, nx.Graph]]
+            a dict of fragment graphs per resolution
+        last_all_atom: bool
+            if the last resolution is at the all
+            atom level. If True the code will use
+            pysmiles to parse the fragments and
+            return the all-atom molecule.
+            Default: True
+        """
         # this is the dict with the fragments
         # either provided and/or extracted
         # from the fragment string
-        self.fragment_dict = fragment_dict
-        # this is the lower resolution graph
-        self.meta_graph = meta_graph
-        # this is the higher resolution graph
-        self.molecule = nx.Graph()
-        # this attribute stores if the fragments
-        # follow open_smiles syntax or cg_smiles
-        # syntax
-        self.all_atom = all_atom
+        self.fragment_dicts = fragment_dicts
+        # this is the current meta_graph
+        self.meta_graph = nx.Graph()
 
-        # here we figure out what we are dealing with
+        # here we figure out how many resolutions
+        # we are dealing with
         elements = re.findall(r"\{[^\}]+\}", pattern)
-        # case 1)
-        # a meta_graph is provided which means we only
-        # have fragments to deal with
-        if meta_graph:
-            if len(elements) > 1:
-                msg = ("When providing a meta_graph, the pattern can only"
-                       "contain fragment.")
-                raise IOError(msg)
-            else:
-                self.fragment_string = elements[0]
 
-        # case 2) we have a meta graph only described
+        # case 1) we have a meta graph only described
         # and the fragment come from elsewhere
-        elif len(elements) == 1 and self.fragment_dict:
-            self.cgsmiles_string = elements[0]
-
-        # case 3) a string containing both fragments and
-        # the meta sequence is provided
+        if len(elements) == 1 and self.fragment_dicts:
+            self.molecule = read_cgsmiles(elements[0])
+            self.fragment_strs = None
+            # the number of resolutions available
+            self.resolutions = len(self.fragment_dicts)
+            # turn the framgent strings into an iterator
+            self.fragment_strs = None
+            self.fragment_dicts = iter(self.fragment_dicts)
         else:
-            if len(elements) < 2:
-                msg = ("When providing a meta_graph, the pattern can only"
-                       "contain fragment.")
-                raise IOError(msg)
+            # case 2)
+            # a meta_graph is provided which means we only
+            # have fragments to deal with
+            if meta_graph:
+                self.fragment_strs = elements
+                self.molecule = meta_graph
+            # case 3) a string containing both fragments and
+            # the meta sequence is provided
             else:
-                self.cgsmiles_string = elements[0]
-                self.fragment_string = elements[1]
+                self.molecule = read_cgsmiles(elements[0])
+                self.fragment_strs = elements[1:]
+
+            # the number of resolutions available
+            self.resolutions = len(self.fragment_strs)
+            # turn the framgent strings into an iterator
+            self.fragment_strs = iter(self.fragment_strs)
+
+        # at this stage there are no atomnames for the nodes
+        new_names = nx.get_node_attributes(self.molecule, "fragname")
+        nx.set_node_attributes(self.meta_graph, new_names, "atomname")
+
+        # if the last resolution is all_atom
+        self.last_all_atom = last_all_atom
+
+        # what is the current resolution
+        self.resolution_counter = 0
+
+        # the next resolution fragments
+        self.fragment_dict = {}
 
     def resolve_disconnected_molecule(self):
         """
@@ -156,14 +185,23 @@ def resolve_disconnected_molecule(self):
 
             self.meta_graph.nodes[meta_node]['graph'] = graph_frag
 
-    def edges_from_bonding_descrpt(self):
+    def edges_from_bonding_descrpt(self, all_atom=False):
         """
-        Make edges according to the bonding descriptors stored
+        Makes edges according to the bonding descriptors stored
         in the node attributes of meta_molecule residue graph.
+
         If a bonding descriptor is consumed it is removed from the list,
-        however, the meta_molecule edge gets an attribute with the
-        bonding descriptors that formed the edge. Later unconsumed
-        bonding descriptors are replaced by hydrogen atoms.
+        however, the meta_graph edge gets an attribute with the
+        bonding descriptors that formed the edge.
+
+        Later unconsumed descriptors are discarded and the valence
+        filled in using hydrogen atoms in case of an atomistic molecule.
+
+        Parameters
+        ----------
+        all_atom: bool
+            if the high resolution level graph has all-atom resolution
+            default: False
         """
         for prev_node, node in self.meta_graph.edges:
             for _ in range(0, self.meta_graph.edges[(prev_node, node)]["order"]):
@@ -182,7 +220,7 @@ def edges_from_bonding_descrpt(self):
                 # unless they are specifically annotated
                 order = int(bonding[0][-1])
                 self.molecule.add_edge(edge[0], edge[1], bonding=bonding, order=order)
-                if self.all_atom:
+                if all_atom:
                     for edge_node in edge:
                         if self.molecule.nodes[edge_node]['element'] != 'H':
                             self.molecule.nodes[edge_node]['hcount'] -= 1
@@ -210,25 +248,52 @@ def squash_atoms(self):
             self.molecule.nodes[node_to_keep]['fragid'] += self.molecule.nodes[node_to_keep]['contraction'][node_to_remove]['fragid']
 
     def resolve(self):
+        """
+        Resolve a CGSmiles string once and return the next resolution.
+        """
+        # increment the resolution counter
+        self.resolution_counter += 1
+
+        # check if this is an all-atom level resolution
+        if self.resolution_counter == self.resolutions and self.last_all_atom:
+            all_atom = True
+        else:
+            all_atom = False
+
+        # get the next set of fragments
+        if self.fragment_strs:
+            fragment_str = next(self.fragment_strs)
+
+            # empty the fragment dict just as a precaution
+            self.fragment_dict = {}
+
+            # read the fragment str and populate dict
+            self.fragment_dict.update(read_fragments(fragment_str,
+                                                     all_atom=all_atom))
+        else:
+            self.fragment_dict = next(self.fragment_dicts)
+
+        # set the previous molecule as meta_graph
+        self.meta_graph = self.molecule
 
-        if self.cgsmiles_string is not None:
-            self.meta_graph = read_cgsmiles(self.cgsmiles_string)
+        # now we have to switch the node names and the fragment names
+        new_fragnames = nx.get_node_attributes(self.meta_graph, "atomname")
+        nx.set_node_attributes(self.meta_graph, new_fragnames, "fragname")
 
-        if self.fragment_string is not None:
-            self.fragment_dict.update(read_fragments(self.fragment_string,
-                                                     all_atom=self.all_atom))
+        # create an empty molecule graph
+        self.molecule = nx.Graph()
 
         # add disconnected fragments to graph
         self.resolve_disconnected_molecule()
 
         # connect valid bonding descriptors
-        self.edges_from_bonding_descrpt()
+        self.edges_from_bonding_descrpt(all_atom=all_atom)
 
         # contract atoms with squash descriptors
         self.squash_atoms()
 
         # rebuild hydrogen in all-atom case
-        if self.all_atom:
+        if all_atom:
             rebuild_h_atoms(self.molecule)
 
         # sort the atoms
@@ -238,4 +303,25 @@ def resolve(self):
         self.meta_graph = annotate_fragments(self.meta_graph,
                                              self.molecule)
 
+        # in all-atom MD there are common naming conventions
+        # that might be expected and hence we set them here
+        if all_atom:
+            set_atom_names_atomistic(self.meta_graph, self.molecule)
+
         return self.meta_graph, self.molecule
+
+    def resolve_iter(self):
+        """
+        Iterator returning all resolutions in oder.
+        """
+        for _ in range(self.resolutions):
+            meta_graph, molecule = self.resolve()
+            yield meta_graph, molecule
+
+    def resolve_all(self):
+        """
+        Resolve all layers and return final molecule
+        as well as the last layer above that.
+        """
+        *_, (meta_graph, graph) = resolver.resolve_iter()
+        return meta_graph, graph
diff --git a/cgsmiles/tests/test_layering.py b/cgsmiles/tests/test_layering.py
@@ -0,0 +1,41 @@
+import textwrap
+import networkx as nx
+import pytest
+import cgsmiles
+from cgsmiles.resolve import MoleculeResolver
+
+@pytest.mark.parametrize('cgsmiles_str, ref_strings',(
+    # simple linear
+    ("""{[#A0][#B0]}.{#A0=[#A1a][#A1b][>],
+                      #B0=[<][#B1a][#B1b]}.
+                     {#A1a=[<][#A2a]([#A2b][#A2c)[#A2d][>],
+                      #A1b=[<][#A2c][#A2d][>],
+                      #B1a=[<][#B2a][#B2b][>],
+                      #B1b=[<][#B2c][>]([#B2d]1[#B2e][#B2f]1)}""",
+     ["{[#A1a][#A1b][#B1a][#B1b]}",
+      "{[#A2a]([#A2b][#A2c])[#A2d][#A2c][#A2d][#B2a][#B2b][#B2c]([#B2d]1[#B2e][#B2f]1)}"]
+    ),
+    # linear with squash
+    ("""{[#A0][#B0]}.{#A0=[!][#A1a][#A1b][>],
+                      #B0=[!][#A1a][#B1b]}.
+                     {#A1a=[<][#A2a]([#A2b][#A2c)[#A2c][!],
+                      #A1b=[!][#A2c][#A2d][>],
+                      #B1b=[<][#B2c][>]([#B2d]1[#B2e][#B2f]1)}""",
+    ["{[#A1a][#A1b][#B1b]}",
+     "{[#A2a]([#A2b][#A2c)[#A2c][#A2d][#B2c]([#B2d]1[#B2e][#B2f]1)}"],),
+    # cycle layering
+    ("""{[#A0]1[#A0][#A0]1}.{#A0=[>][#A1a][#A1b][<]}.
+        {#A1a=[>][#A2a][#A2b][#A2c][<],#A1b=[<][#A2e][>]([#C][#D])}""",
+    ["{[#A1a]1[#A1b][#A1a][#A1b][#A1a][#A1b]1}",
+     "{[#A2a]1[#A2b][#A2c][#A2e]([#C][#D])[#A2e]([#C][#D])[#A2a][#A2b][#A2c][#A2e]([#C][#D])[#A2a][#A2b][#A2c][#A2e]1([#C][#D])"]
+)))
+def test_layering_of_resolutions(cgsmiles_str, ref_strings):
+
+    def _node_match(n1, n2):
+        return n1["fragname"] == n2["fragname"]
+
+    cgsmiles_str = cgsmiles_str.strip().replace('\n','').replace(' ','')
+    resolver = MoleculeResolver(cgsmiles_str, last_all_atom=False)
+    for (low_graph, high_graph), ref_str in zip(resolver.resolve_iter(), ref_strings):
+        ref_graph = cgsmiles.read_cgsmiles(ref_str)
+        nx.is_isomorphic(ref_graph, high_graph, node_match=_node_match)