gruenewald-lab · fgrunewald · Apr 14, 2024 · Apr 14, 2024 · Apr 14, 2024 · Apr 14, 2024
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,19 @@
+**/__pycache__
+**.pyc
+**.bak
+\#*
+
+**.lprof
+
+.eggs/**
+*.swp
+.pytest_cache
+*.egg-info
+.coverage
+.hypothesis
+
+htmlcov
+
+docs/build
+docs/source/api
+docs/source/.doctrees
diff --git a/README.md b/README.md
diff --git a/README.rst b/README.rst
@@ -0,0 +1,87 @@
+================================
+Coarse-Grained SMILES (CGsmiles)
+================================
+
+Overview
+========
+
+CGSmiles is a line notation syntax to describe arbitrarily complex
+graphs - usually molecules - using a line notation. In contrast to
+existing (add link to BigSmiles, CurlySmiles ...) line notations it
+applies to any resolution (e.g. coarse-grained models) and allows to
+connect multiple resolutions. Therefore, it serves as a short, 
+easy-to-read, and efficient notation for molecules and describes 
+transformations between resolutions. In addition it's syntax borrows
+some features from the BigSmiles notation that makes expressing large
+polymeric molecules simpler.
+
+The CGSmiles Python package is created around this notation to read and
+resolve molecules yielding networkx graphs, which represent the
+different resolutions expressed in the CGSmiles string.
+
+CGSmiles are also implemented in the polyply package, which allows
+users to generate coordinates of complex polymers using CGsmiles.
+
+Resources
+=========
+
+- here go some resources
+
+Related Tools
+=============
+
+- `polyply <https://github.com/marrink-lab/polyply_1.0>`__:
+Generate topology files and coordinates for molecular dynamics (MD)
+from CGSmiles notation.
+
+- `fast_forward <https://github.com/fgrunewald/fast_forward>`__:
+Forward map trajectories from a high to lower resolution using
+CGSmiles.
+
+Citation
+========
+
+When using **cgsmiles** to for your publication, please:
+
+
+Installation
+============
+
+The easiest ways to install **cgsmiles** are using pip:
+
+.. code:: bash
+
+   pip install cgsmiles
+
+Alternatively install the master branch directly from GitHub:
+
+.. code:: bash
+
+   pip install git+https://github.com/gruenewald-lab/CGsmiles.git
+
+Examples
+========
+
+The cgsmiles python package is designed to read and resolve these smiles
+into networkx graphs that can be used for further operations.
+
+.. code:: python
+
+   import matplotlib.pyplot as plt
+   import networkx as nx
+   import cgsmiles
+
+   # Express 5 units of Polystyrene in CGSmiles
+   cgsmiles_str = "{[#PS]|5}.{#PS=[$]CC[$](c1ccccc1)}"
+
+   # Resolve molecule into networkx graphs
+   res_graph, mol_graph = cgsmiles.MoleculeResolver(cgsmiles_str).resolve()
+
+   # Draw molecule at different resolutions
+   for g in [res_graph, mol_graph]:
+      nx.draw_networkx(g)
+      plt.show()
+
+   # Get fragment corresponding to first residue
+   fragment_1 = res_graph.nodes[0]['graph']
+
diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py
@@ -164,8 +164,11 @@ def set_atom_names_atomistic(molecule, meta_graph=None):
     fraglist = defaultdict(list)
     if meta_graph:
         for meta_node in meta_graph.nodes:
-            fraggraph = meta_graph.nodes[meta_node]['graph']
-            fraglist[meta_node] += list(fraggraph.nodes)
+            # to catch virtual side nodes that do not have a representation
+            # in the atomsitic structure
+            fraggraph = meta_graph.nodes[meta_node].get('graph', None)
+            if fraggraph:
+                fraglist[meta_node] += list(fraggraph.nodes)
     else:
         node_to_fragid = nx.get_node_attributes(molecule, 'fragid')
         for node, fragids in node_to_fragid.items():

diff --git a/cgsmiles/read_cgsmiles.py b/cgsmiles/read_cgsmiles.py
@@ -31,32 +31,29 @@ def _expand_branch(mol_graph, current, anchor, recipe):
     anchor: abc.hashable
         anchor to which to connect current node
 
-    recipie: list[(str, int)]
+    recipe: list[(str, int, int)]
         list storing tuples of node names and
         the number of times the node has to be added
+        and their bond order
 
     Returns
     -------
     nx.Graph
     """
     prev_node = anchor
-    for bdx, (fragname, n_mon) in enumerate(recipe):
+    for bdx, (fragname, n_mon, order) in enumerate(recipe):
         if bdx == 0:
             anchor = current
         for _ in range(0, n_mon):
             mol_graph.add_node(current, fragname=fragname)
-            mol_graph.add_edge(prev_node, current, order=1)
+            mol_graph.add_edge(prev_node, current, order=order)
 
             prev_node = current
             current += 1
 
     prev_node = anchor
     return mol_graph, current, prev_node
 
-def _get_percent(pattern, stop):
-    end_num = _find_next_character(pattern, ['[', ')', '(', '}'], stop)
-    return pattern[stop+1:end_num]
-
 def read_cgsmiles(pattern):
     """
     Generate a :class:`nx.Graph` from a pattern string according to the
@@ -133,6 +130,10 @@ def read_cgsmiles(pattern):
     cycle_edges = []
     # each element in the for loop matches a pattern
     # '[' + '#' + some alphanumeric name + ']'
+    symbol_to_order = {".": 0, "=": 2, "-": 1, "#": 3, "$": 4}
+    default_bond_order = 1
+    bond_order = None
+    prev_bond_order = None
     for match in re.finditer(PATTERNS['place_holder'], pattern):
         start, stop = match.span()
         # we start a new branch when the residue is preceded by '('
@@ -142,32 +143,61 @@ def read_cgsmiles(pattern):
             branch_anchor.append(prev_node)
             # the recipe for making the branch includes the anchor;
             # which is hence the first residue in the list
-            recipes[branch_anchor[-1]] = [(mol_graph.nodes[prev_node]['fragname'], 1)]
+            # at this point the bond order is still 1 unless we have an expansion
+            recipes[branch_anchor[-1]] = [(mol_graph.nodes[prev_node]['fragname'], 1, 1)]
 
         # here we check if the atom is followed by a cycle marker
         # in this case we have an open cycle and close it
-        for token in pattern[stop:]:
-            # we close a cycle
-            if token.isdigit() and token in cycle:
-                cycle_edges.append((current, cycle[token]))
-                del cycle[token]
-            # we open a cycle
-            elif token.isdigit():
-                cycle[token] = current
-            # we found a ring indicator
-            elif token == "%":
-                ring_marker = _get_percent(pattern, stop)
-                # we close the ring
+        ring_marker = ""
+        multi_ring = False
+        ring_bond_order = default_bond_order
+        for rdx, token in enumerate(pattern[stop:]):
+            if multi_ring and not token.isdigit():
+                ring_marker = int(ring_marker[1:])
                 if ring_marker in cycle:
-                    cycle_edges.append((current, cycle[ring_marker]))
+                    cycle_edges.append((current,
+                                        cycle[ring_marker][0],
+                                        cycle[ring_marker][1]))
                     del cycle[ring_marker]
-                    break
-                # we open a new ring
-                cycle[_get_percent(pattern, stop)] = current
-                break
+                else:
+                    cycle[ring_marker] = [current, ring_bond_order]
+                multi_ring = False
+                ring_marker = ""
+                ring_bond_order = default_bond_order
+
+            # we open a new multi ring
+            if token == "%":
+                multi_ring = True
+                ring_marker = '%'
+            # we open a ring or close
+            elif token.isdigit():
+                ring_marker += token
+                if not multi_ring:
+                    ring_marker = int(ring_marker)
+                    # we have a single digit marker and it is in
+                    # cycle so we close it
+                    if ring_marker in cycle:
+                        cycle_edges.append((current,
+                                            cycle[ring_marker][0],
+                                            cycle[ring_marker][1]))
+                        del cycle[ring_marker]
+                    # the marker is not in cycle so we update cycles
+                    else:
+                        cycle[ring_marker] = [current, ring_bond_order]
+                    ring_marker = ""
+                    ring_bond_order = default_bond_order
+            # we found bond_order
+            elif token in symbol_to_order:
+                ring_bond_order = symbol_to_order[token]
             else:
                 break
 
+        # check if there is a bond-order following the node
+        if stop < len(pattern) and pattern[stop+rdx-1] in '- + . = # $':
+            bond_order = symbol_to_order[pattern[stop+rdx-1]]
+        else:
+            bond_order = default_bond_order
+
         # here we check if the atom is followed by a expansion character '|'
         # as in ... [#PEO]|
         if stop < len(pattern) and pattern[stop] == '|':
@@ -185,28 +215,41 @@ def read_cgsmiles(pattern):
         # the fragname starts at the second character and ends
         # one before the last according to the above pattern
         fragname = match.group(0)[2:-1]
+        # check for charge
+        charge = 0.0
+        for sign in ["+", "-"]:
+            if sign in fragname:
+                fragname, charge = fragname.split(sign)
+                if len(charge) == 0:
+                    charge = float(sign+"1")
+                else:
+                    charge = float(sign+charge)
+
         # if this residue is part of a branch we store it in
         # the recipe dict together with the anchor residue
         # and expansion number
         if branching:
-            recipes[branch_anchor[-1]].append((fragname, n_mon))
+            recipes[branch_anchor[-1]].append((fragname, n_mon, prev_bond_order))
 
         # new we add new residue as often as required
         connection = []
         for _ in range(0, n_mon):
-            mol_graph.add_node(current, fragname=fragname)
+            mol_graph.add_node(current, fragname=fragname, charge=charge)
 
             if prev_node is not None:
-                mol_graph.add_edge(prev_node, current, order=1)
+                mol_graph.add_edge(prev_node, current, order=prev_bond_order)
+
+            prev_bond_order = bond_order
 
             # here we have a double edge
             for cycle_edge in cycle_edges:
                 if cycle_edge in mol_graph.edges:
-                    mol_graph.edges[cycle_edge]["order"] += 1
-                else:
-                    mol_graph.add_edge(cycle_edge[0],
-                                       cycle_edge[1],
-                                       order=1)
+                    msg=("You define two edges between the same node."
+                         "Use bond order symbols instead.")
+                    raise SyntaxError(msg)
+                mol_graph.add_edge(cycle_edge[0],
+                                   cycle_edge[1],
+                                   order=cycle_edge[2])
 
             prev_node = current
             current += 1
@@ -236,12 +279,19 @@ def read_cgsmiles(pattern):
             eon_a = _find_next_character(pattern, [')'], stop)
             # Then we check if the expansion character
             # is next.
-            if eon_a+1 < len(pattern) and pattern[eon_a+1] == "|":
+            if (eon_a+1 < len(pattern) and pattern[eon_a+1] == "|") or\
+               (eon_a+2 < len(pattern) and pattern[eon_a+2] == "|"):
+                if pattern[eon_a+2] == "|":
+                    anchor_order = symbol_to_order[pattern[eon_a+1]]
+                    recipe = recipes[prev_node][0]
+                    recipes[prev_node][0] = (recipe[0], recipe[1], anchor_order)
+                    eon_a += 1
                 # If there is one we find the beginning
                 # of the next branch, residue or end of the string
                 # As before all characters inbetween are a number that
                 # is how often the branch is expanded.
-                eon_b = _find_next_character(pattern, ['[', ')', '(', '}'], eon_a+1)
+                next_characters = ['[', ')', '(', '}'] + list(symbol_to_order.keys())
+                eon_b = _find_next_character(pattern, next_characters, eon_a+1)
                 # the outermost loop goes over how often a the branch has to be
                 # added to the existing sequence
                 for idx in range(0,int(pattern[eon_a+2:eon_b])-1):
@@ -276,6 +326,13 @@ def read_cgsmiles(pattern):
                         prev_anchor = ref_anchor
                 # all branches added; then go back to the base anchor
                 prev_node = base_anchor
+            #================================================
+            #     bond orders for after branches            #
+            #================================================
+                if pattern[eon_b] in symbol_to_order:
+                    prev_bond_order = symbol_to_order[pattern[eon_b]]
+            elif eon_a+1 < len(pattern) and pattern[eon_a+1] in symbol_to_order:
+                prev_bond_order = symbol_to_order[pattern[eon_a+1]]
             # if all branches are done we need to reset the lists
             # when all nested branches are completed
             if len(branch_anchor) == 0:

diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py
@@ -1,5 +1,6 @@
 import re
 import copy
+import numpy as np
 import networkx as nx
 from .read_cgsmiles import read_cgsmiles
 from .read_fragments import read_fragments
@@ -239,6 +240,15 @@ def resolve_disconnected_molecule(self, fragment_dict):
         """
         for meta_node in self.meta_graph.nodes:
             fragname = self.meta_graph.nodes[meta_node]['fragname']
+
+            # we have a virtual node and skip it
+            if fragname not in fragment_dict:
+                neighbors = self.meta_graph.neighbors(meta_node)
+                orders = [self.meta_graph.edges[(meta_node, neigh)]["order"] for neigh in neighbors]
+                if not all(np.array(orders) == 0):
+                    raise SyntaxError(f"Found node #{fragname} but no corresponding fragment.")
+                continue
+
             fragment = fragment_dict[fragname]
             correspondence = merge_graphs(self.molecule, fragment)
 

diff --git a/cgsmiles/sample.py b/cgsmiles/sample.py
@@ -59,7 +59,7 @@ class MoleculeSampler:
     descriptor is used to highlight the fact that the probabilistic behaviour is driven
     by the bonding descriptors.
 
-     The sampler features a two level connectivity determination. First using the
+    The sampler features a two level connectivity determination. First using the
     `polymer_reactivities` an open bonding descriptor from the growing polymer
     molecule is selected according to the probabilities provided. Subsequently,
     a fragment is randomly chosen that has a matching complementary bonding