gruenewald-lab · fgrunewald · Sep 18, 2024 · Jul 9, 2024 · Jul 9, 2024 · Jul 10, 2024
diff --git a/cgsmiles/__init__.py b/cgsmiles/__init__.py
@@ -5,3 +5,4 @@
 from .read_cgsmiles import read_cgsmiles
 from .read_fragments import read_fragments
 from .resolve import MoleculeResolver
+from .sample import MoleculeSampler
diff --git a/cgsmiles/cgsmiles_utils.py b/cgsmiles/cgsmiles_utils.py
@@ -0,0 +1,66 @@
+from collections import defaultdict
+import networkx as nx
+
+def find_complementary_bonding_descriptor(bonding_descriptor, ellegible_descriptors=None):
+    """
+    Given a bonding descriptor find the complementary match(es).
+    A '$' descriptor matches every eligible '$' descriptor ending in
+    the same character; '>' and '<' get flipped to the other symbol.
+
+    Parameters
+    ----------
+    bonding_descriptor: str
+    ellegible_descriptors: list[str]
+        allowed descriptors to match; None disables the check
+
+    Return
+    ------
+    list[str]
+
+    Raises
+    ------
+    IOError
+        if the complement is not among ellegible_descriptors
+    """
+    prefix = bonding_descriptor[0]
+    if prefix == '$' and ellegible_descriptors:
+        return [descriptor for descriptor in ellegible_descriptors
+                if descriptor[0] == '$' and descriptor[-1] == bonding_descriptor[-1]]
+
+    # '<' and '>' swap; every other prefix is complementary to itself
+    flipped = {'<': '>', '>': '<'}.get(prefix, prefix)
+    compl = flipped + bonding_descriptor[1:]
+
+    # None means "no restriction"; `compl not in None` would raise TypeError
+    if ellegible_descriptors is not None and compl not in ellegible_descriptors:
+        msg = ("Bonding descriptor {compl} was not found in list of potential "
+               "matching descriptors.")
+        raise IOError(msg.format(compl=compl))
+
+    return [compl]
+
+def find_open_bonds(molecule, target_nodes=None):
+    """
+    Collect all open bonding descriptors and store them as
+    keys with the list of nodes that carry them as values.
+
+    Parameters
+    ----------
+    molecule: nx.Graph
+    target_nodes: list[abc.hashable]
+        node keys of molecule to consider; default: all nodes
+
+    Return
+    ------
+    dict
+    """
+    if target_nodes is None:
+        target_nodes = molecule
+
+    open_bonds_by_descriptor = defaultdict(list)
+    open_bonds = nx.get_node_attributes(molecule, 'bonding')
+    for node, bonding_types in open_bonds.items():
+        if node in target_nodes:
+            for bonding_type in bonding_types:
+                open_bonds_by_descriptor[bonding_type].append(node)
+    return open_bonds_by_descriptor
diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py
@@ -42,7 +42,7 @@ def merge_graphs(source_graph, target_graph, max_node=None):
     correspondence = {}
     for idx, node in enumerate(target_graph.nodes(), start=offset + 1):
         correspondence[node] = idx
-        new_atom = copy.copy(target_graph.nodes[node])
+        new_atom = copy.deepcopy(target_graph.nodes[node])
         new_atom['fragid'] = [(new_atom.get('fragid', 0) + fragment_offset)]
         source_graph.add_node(idx, **new_atom)
 
@@ -116,15 +116,32 @@ def annotate_fragments(meta_graph, molecule):
     return meta_graph
 
 
-def set_atom_names_atomistic(meta_graph, molecule):
+def set_atom_names_atomistic(molecule, meta_graph=None):
     """
     Set atomnames according to commonly used convention
     in molecular dynamics (MD) forcefields. This convention
     is defined as element plus counter for atom in residue.
+
+    Parameters
+    ----------
+    molecule: nx.Graph
+        the molecule for which to adjust the atomnames
+    meta_graph: nx.Graph
+        optional; get the fragments from the meta_graph
+        attributes which is faster in some cases
     """
-    for meta_node in meta_graph.nodes:
-        fraggraph = meta_graph.nodes[meta_node]['graph']
-        for idx, node in enumerate(fraggraph.nodes):
-            atomname = fraggraph.nodes[node]['element'] + str(idx)
-            fraggraph.nodes[node]['atomname'] = atomname
+    fraglist = defaultdict(list)
+    if meta_graph:
+        for meta_node in meta_graph.nodes:
+            fraggraph = meta_graph.nodes[meta_node]['graph']
+            fraglist[meta_node] += list(fraggraph.nodes)
+    else:
+        node_to_fragid = nx.get_node_attributes(molecule, 'fragid')
+        for node, fragids in node_to_fragid.items():
+            assert len(fragids) == 1
+            fraglist[fragids[0]].append(node)
+
+    for fragnodes in fraglist.values():
+        for idx, node in enumerate(fragnodes):
+            atomname = molecule.nodes[node]['element'] + str(idx)
             molecule.nodes[node]['atomname'] = atomname
diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
@@ -1,6 +1,30 @@
 import networkx as nx
 import pysmiles
 
+def compute_mass(input_molecule):
+    """
+    Compute the mass of a molecule by summing, for every
+    node, the atomic mass that the pysmiles PTE lists for
+    the element of that node.
+
+    Parameters
+    ----------
+    input_molecule: nx.Graph
+        molecule which must have element specified per node
+
+    Returns
+    -------
+    float
+        the mass of the molecule including hydrogen atoms
+    """
+    # the hydrogen atoms are needed for the mass; they are
+    # added to a copy of the graph that was handed in
+    with_hydrogens = input_molecule.copy()
+    rebuild_h_atoms(with_hydrogens)
+    atoms = with_hydrogens.nodes
+    return sum(pysmiles.PTE[atoms[node]['element']]['AtomicMass']
+               for node in atoms)
+
 def rebuild_h_atoms(mol_graph, keep_bonding=False):
     """
     Helper function which add hydrogen atoms to the molecule graph.

diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
@@ -151,7 +151,6 @@ def fragment_iter(fragment_str, all_atom=True):
         # we deal with a CG resolution graph
         else:
             mol_graph = read_cgsmiles(smile)
-            nx.set_node_attributes(mol_graph, 1, 'fragid')
             fragnames = nx.get_node_attributes(mol_graph, 'fragname')
             nx.set_node_attributes(mol_graph, fragnames, 'atomname')
             nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
@@ -161,6 +160,7 @@ def fragment_iter(fragment_str, all_atom=True):
             nx.set_node_attributes(mol_graph, atomnames, 'atomname')
 
         nx.set_node_attributes(mol_graph, fragname, 'fragname')
+        nx.set_node_attributes(mol_graph, 0, 'fragid')
         yield fragname, mol_graph
 
 def read_fragments(fragment_str, all_atom=True, fragment_dict=None):

diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py
@@ -9,28 +9,39 @@
                           set_atom_names_atomistic)
 from .pysmiles_utils import rebuild_h_atoms
 
-def compatible(left, right):
+def compatible(left, right, legacy=False):
     """
     Check bonding descriptor compatibility according
-    to the BigSmiles syntax conventions.
+    to the CGSmiles syntax conventions. With legacy
+    the BigSmiles convention can be used.
 
     Parameters
     ----------
     left: str
     right: str
+    legacy: bool
 
     Returns
     -------
     bool
     """
-    if left == right and left[0] not in '> <':
-        return True
-    l, r = left[0], right[0]
-    if (l, r) == ('<', '>') or (l, r) == ('>', '<'):
-        return left[1:] == right[1:]
-    return False
-
-def match_bonding_descriptors(source, target, bond_attribute="bonding"):
+    if legacy:
+        if left == right and left[0] not in '> <':
+            return True
+        l, r = left[0], right[0]
+        if (l, r) == ('<', '>') or (l, r) == ('>', '<'):
+            return left[1:] == right[1:]
+        return False
+    else:
+        if left[0] == right[0] == '$' or left[0] == right[0] == '!':
+            return True
+
+        l, r = left[0], right[0]
+        if (l, r) == ('<', '>') or (l, r) == ('>', '<'):
+            return True
+        return False
+
+def match_bonding_descriptors(source, target, bond_attribute="bonding", legacy=False):
     """
     Given a source and a target graph, which have bonding
     descriptors stored as node attributes, find a pair of
@@ -46,6 +57,9 @@ def match_bonding_descriptors(source, target, bond_attribute="bonding"):
     bond_attribute: `abc.hashable`
         under which attribute are the bonding descriptors
         stored.
+    legacy: bool
+        which syntax convention to use when matching bonding
+        descriptors (legacy=BigSmiles)
 
     Returns
     -------
@@ -65,7 +79,7 @@ def match_bonding_descriptors(source, target, bond_attribute="bonding"):
             bond_targets = target_nodes[target_node]
             for bond_source in bond_sources:
                 for bond_target in bond_targets:
-                    if compatible(bond_source, bond_target):
+                    if compatible(bond_source, bond_target, legacy=legacy):
                         return ((source_node, target_node), (bond_source, bond_target))
     raise LookupError
 
@@ -141,7 +155,8 @@ class MoleculeResolver:
     def __init__(self,
                  molecule_graph,
                  fragment_dicts,
-                 last_all_atom=True):
+                 last_all_atom=True,
+                 legacy=False):
 
         """
         Parameters
@@ -158,6 +173,16 @@ def __init__(self,
             if the last resolution is at the all atom level. If True the code
             will use pysmiles to parse the fragments and return the all-atom
             molecule. Default: True
+        legacy: bool
+            which syntax convention to use for matching the bonding descriptors.
+            Legacy syntax adheres to the BigSmiles convention. Default syntax
+            adheres to CGSmiles convention where bonding descriptors '$' match
+            with every '$' and every '<' matches every '>'. With the BigSmiles
+            convention an alphanumeric string may be provided that distinguishes
+            these connectors. For example, '$A' would not match '$B'. However,
+            such use cases should be rare and the CGSmiles convention facilitates
+            usage of bonding descriptors in the Sampler where the labels are used
+            to assign different probabilities.
         """
         self.meta_graph = nx.Graph()
         self.fragment_dicts = fragment_dicts
@@ -167,6 +192,7 @@ def __init__(self,
         self.resolutions = len(self.fragment_dicts)
         new_names = nx.get_node_attributes(self.molecule, "fragname")
         nx.set_node_attributes(self.meta_graph, new_names, "atomname")
+        self.legacy = legacy
 
     @staticmethod
     def read_fragment_strings(fragment_strings, last_all_atom=True):
@@ -256,7 +282,8 @@ def edges_from_bonding_descrpt(self, all_atom=False):
                 node_graph = self.meta_graph.nodes[node]['graph']
                 try:
                     edge, bonding = match_bonding_descriptors(prev_graph,
-                                                              node_graph)
+                                                              node_graph,
+                                                              legacy=self.legacy)
                 except LookupError:
                     continue
                 # remove used bonding descriptors
@@ -337,7 +364,7 @@ def resolve(self):
         # in all-atom MD there are common naming conventions
         # that might be expected and hence we set them here
         if all_atom:
-            set_atom_names_atomistic(self.meta_graph, self.molecule)
+            set_atom_names_atomistic(self.molecule, self.meta_graph)
 
         # increment the resolution counter
         self.resolution_counter += 1
@@ -361,7 +388,7 @@ def resolve_all(self):
         return meta_graph, graph
 
     @classmethod
-    def from_string(cls, cgsmiles_str, last_all_atom=True):
+    def from_string(cls, cgsmiles_str, last_all_atom=True, legacy=False):
         """
         Initiate a MoleculeResolver instance from a cgsmiles string.
 
@@ -370,6 +397,11 @@ def from_string(cls, cgsmiles_str, last_all_atom=True):
         cgsmiles_str: str
         last_all_atom: bool
             if the last resolution is all-atom and is read using pysmiles
+        legacy: bool
+            which syntax convention to use for matching the bonding descriptors.
+            Legacy syntax adheres to the BigSmiles convention. Default syntax
+            adheres to CGSmiles convention. A more detailed explanation can be
+            found in the :func:`~resolve.MoleculeResolver.__init__` method.
 
         Returns
         -------
@@ -384,11 +416,12 @@ def from_string(cls, cgsmiles_str, last_all_atom=True):
                                                    last_all_atom=last_all_atom)
         resolver_obj = cls(molecule_graph=molecule,
                            fragment_dicts=fragment_dicts,
-                           last_all_atom=last_all_atom)
+                           last_all_atom=last_all_atom,
+                           legacy=legacy)
         return resolver_obj
 
     @classmethod
-    def from_graph(cls, cgsmiles_str, meta_graph, last_all_atom=True):
+    def from_graph(cls, cgsmiles_str, meta_graph, last_all_atom=True, legacy=False):
         """
         Initiate a MoleculeResolver instance from a cgsmiles string
         and a `meta_graph` that describes the lowest resolution.
@@ -401,6 +434,11 @@ def from_graph(cls, cgsmiles_str, meta_graph, last_all_atom=True):
             fragname attribute set.
         last_all_atom: bool
             if the last resolution is all-atom and is read using pysmiles
+        legacy: bool
+            which syntax convention to use for matching the bonding descriptors.
+            Legacy syntax adheres to the BigSmiles convention. Default syntax
+            adheres to CGSmiles convention. A more detailed explanation can be
+            found in the :func:`~resolve.MoleculeResolver.__init__` method.
 
         Returns
         -------
@@ -417,12 +455,13 @@ def from_graph(cls, cgsmiles_str, meta_graph, last_all_atom=True):
 
         resolver_obj = cls(molecule_graph=meta_graph,
                            fragment_dicts=fragment_dicts,
-                           last_all_atom=last_all_atom)
+                           last_all_atom=last_all_atom,
+                           legacy=legacy)
 
         return resolver_obj
 
     @classmethod
-    def from_fragment_dicts(cls, cgsmiles_str, fragment_dicts, last_all_atom=True):
+    def from_fragment_dicts(cls, cgsmiles_str, fragment_dicts, last_all_atom=True, legacy=False):
         """
         Initiate a MoleculeResolver instance from a cgsmiles string, describing
         one molecule and fragment_dicts containing fragments for each resolution.
@@ -436,6 +475,11 @@ def from_fragment_dicts(cls, cgsmiles_str, fragment_dicts, last_all_atom=True):
             function.
         last_all_atom: bool
             if the last resolution is all-atom and is read using pysmiles
+        legacy: bool
+            which syntax convention to use for matching the bonding descriptors.
+            Legacy syntax adheres to the BigSmiles convention. Default syntax
+            adheres to CGSmiles convention. A more detailed explanation can be
+            found in the :func:`~resolve.MoleculeResolver.__init__` method.
 
         Returns
         -------
@@ -451,5 +495,6 @@ def from_fragment_dicts(cls, cgsmiles_str, fragment_dicts, last_all_atom=True):
         molecule = read_cgsmiles(elements[0])
         resolver_obj = cls(molecule_graph=molecule,
                            fragment_dicts=fragment_dicts,
-                           last_all_atom=last_all_atom)
+                           last_all_atom=last_all_atom,
+                           legacy=legacy)
         return resolver_obj