gruenewald-lab · fgrunewald · Sep 18, 2024 · Jul 9, 2024 · Jul 9, 2024 · Jul 10, 2024
diff --git a/cgsmiles/__init__.py b/cgsmiles/__init__.py
@@ -5,3 +5,4 @@
 from .read_cgsmiles import read_cgsmiles
 from .read_fragments import read_fragments
 from .resolve import MoleculeResolver
+from .sample import MoleculeSampler
diff --git a/cgsmiles/cgsmiles_utils.py b/cgsmiles/cgsmiles_utils.py
@@ -16,14 +16,18 @@ def find_complementary_bonding_descriptor(bonding_descriptor):
         compl = bonding_descriptor
     return compl
 
-def find_open_bonds(molecule):
+def find_open_bonds(molecule, target_nodes=None):
     """
     Collect all nodes which have an open bonding descriptor and store
     them as keys with a list of nodes as values.
     """
+    if target_nodes is None:
+        target_nodes = list(molecule.nodes)
+
     open_bonds_by_descriptor = defaultdict(list)
     open_bonds = nx.get_node_attributes(molecule, 'bonding')
     for node, bonding_types in open_bonds.items():
-        for bonding_types in bonding_types:
-            open_bonds_by_descriptor[bonding_types].append(node)
+        if node in target_nodes:
+            for bonding_types in bonding_types:
+                open_bonds_by_descriptor[bonding_types].append(node)
     return open_bonds_by_descriptor
diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py
@@ -116,15 +116,32 @@ def annotate_fragments(meta_graph, molecule):
     return meta_graph
 
 
-def set_atom_names_atomistic(meta_graph, molecule):
+def set_atom_names_atomistic(molecule, meta_graph=None):
     """
     Set atomnames according to commonly used convention
     in molecular dynamics (MD) forcefields. This convention
     is defined as element plus counter for atom in residue.
+
+    Parameters
+    ----------
+    molecule: nx.Graph
+        the molecule for which to adjust the atomnames
+    meta_graph: nx.Graph
+        optional; get the fragments from the meta_graph
+        attributes which is faster in some cases
     """
-    for meta_node in meta_graph.nodes:
-        fraggraph = meta_graph.nodes[meta_node]['graph']
-        for idx, node in enumerate(fraggraph.nodes):
-            atomname = fraggraph.nodes[node]['element'] + str(idx)
-            fraggraph.nodes[node]['atomname'] = atomname
+    fraglist = defaultdict(list)
+    if meta_graph:
+        for meta_node in meta_graph.nodes:
+            fraggraph = meta_graph.nodes[meta_node]['graph']
+            fraglist[meta_node] += list(fraggraph.nodes)
+    else:
+        node_to_fragid = nx.get_node_attributes(molecule, 'fragid')
+        for node, fragids in node_to_fragid.items():
+            assert len(fragids) == 1
+            fraglist[fragids[0]].append(node)
+
+    for fragnodes in fraglist.values():
+        for idx, node in enumerate(fragnodes):
+            atomname = molecule.nodes[node]['element'] + str(idx)
             molecule.nodes[node]['atomname'] = atomname
diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
@@ -1,7 +1,7 @@
 import networkx as nx
 import pysmiles
 
-def compute_mass(molecule):
+def compute_mass(input_molecule):
     """
     Compute the mass of a molecule from the PTE.
 
@@ -15,6 +15,11 @@ def compute_mass(molecule):
     float
         the atomic mass
     """
+    molecule = input_molecule.copy()
+    print(molecule.nodes(data=True))
+    # we need to add the hydrogen atoms
+    # for computing the mass
+    rebuild_h_atoms(molecule)
     mass = 0
     for node in molecule.nodes:
         element = molecule.nodes[node]['element']

diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
@@ -151,7 +151,6 @@ def fragment_iter(fragment_str, all_atom=True):
         # we deal with a CG resolution graph
         else:
             mol_graph = read_cgsmiles(smile)
-            nx.set_node_attributes(mol_graph, 1, 'fragid')
             fragnames = nx.get_node_attributes(mol_graph, 'fragname')
             nx.set_node_attributes(mol_graph, fragnames, 'atomname')
             nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
@@ -161,6 +160,7 @@ def fragment_iter(fragment_str, all_atom=True):
             nx.set_node_attributes(mol_graph, atomnames, 'atomname')
 
         nx.set_node_attributes(mol_graph, fragname, 'fragname')
+        nx.set_node_attributes(mol_graph, 0, 'fragid')
         yield fragname, mol_graph
 
 def read_fragments(fragment_str, all_atom=True, fragment_dict=None):

diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py
@@ -337,7 +337,7 @@ def resolve(self):
         # in all-atom MD there are common naming conventions
         # that might be expected and hence we set them here
         if all_atom:
-            set_atom_names_atomistic(self.meta_graph, self.molecule)
+            set_atom_names_atomistic(self.molecule, self.meta_graph)
 
         # increment the resolution counter
         self.resolution_counter += 1

diff --git a/cgsmiles/sample.py b/cgsmiles/sample.py
@@ -17,12 +17,13 @@ class MoleculeSampler:
     """
     def __init__(self,
                  fragment_dict,
-                 target_weight,
                  bonding_probabilities,
+                 branch_term_probs=None,
+                 terminal_fragments=[],
+                 bond_term_probs=None,
                  fragment_masses=None,
-                 termination_probabilities=None,
-                 start=None,
-                 all_atom=True):
+                 all_atom=True,
+                 seed=None):
 
         """
         Parameters
@@ -39,31 +40,31 @@ def __init__(self,
             masses of the molecule fragments; if all_atom is True
             these can be left out and are automatically computed from
             the element masses
-        termination_probabilities: dict[str, float]
-            probability that a fragment is a chain terminal, which means
-            all descriptors will be removed from that fragment and
-            terminate chain growth. No additional terminal residue is
-            specified; this should be used for coarse-grained polymers
-            without specific end-group.
+        branch_term_probs: dict[str, float]
+            probability that a branched fragment is a chain terminal;
+            if bond_term_probs are given a terminal fragment is added
+        bond_term_probs: dict[str, float]
+            probability that a certain bonding descriptor connection
+            is present at the terminal
         start: str
             fragment name of the fragment to start with
         all_atom: bool
             if the fragments are all-atom resolution
         seed: int
             set random seed for all processes; default is None
         """
+        # first initialize the random number generator
+        random.seed(a=seed)
         self.fragment_dict = fragment_dict
         self.bonding_probabilities = bonding_probabilities
-        self.termination_probabilities = termination_probabilities
+        self.branch_term_probs = branch_term_probs
+        self.bond_term_probs = bond_term_probs
         self.all_atom = all_atom
-        self.target_weight = target_weight
-        self.start = start
-        self.current_open_bonds = defaultdict(list)
-        self.current_weight = 0
 
         # we need to make sure that we have the molecular
         # masses so we can compute the target weight
         self.fragments_by_bonding = defaultdict(list)
+        self.terminals_by_bonding = defaultdict(list)
         if fragment_masses:
             guess_mass_from_PTE = False
             self.fragment_masses = fragment_masses
@@ -85,9 +86,12 @@ def __init__(self,
             bondings = nx.get_node_attributes(fraggraph, "bonding")
             for node, bondings in bondings.items():
                 for bonding in bondings:
-                    self.fragments_by_bonding[bonding].append((fragname, node))
+                    if fragname in terminal_fragments:
+                        self.terminals_by_bonding[bonding].append((fragname, node))
+                    else:
+                        self.fragments_by_bonding[bonding].append((fragname, node))
 
-    def grow_chain(self, molecule, seed=None):
+    def add_fragment(self, molecule, open_bonds, fragments, bonding_probabilities):
         """
         Pick an open bonding descriptor according to `bonding_probabilities`
         and then pick a fragment that has the complementory bonding descriptor.
@@ -96,6 +100,13 @@ def grow_chain(self, molecule, seed=None):
         ----------
         molecule: nx.Graph
             the molecule to extend
+        open_bonds: dict[list[abc.hashable]]
+            a dict of bonding active descriptors with list of nodes
+            in molecule as value
+        fragments: dict[list[str]]
+            a dict of fragment names indexed by their bonding descriptors
+        bonding_probabilities:
+            the probabilities that a bonding connector forms a bond
 
         Returns
         -------
@@ -106,30 +117,58 @@ def grow_chain(self, molecule, seed=None):
         """
         # 1. get the probabilties of any bonding descriptor on the chain to
         #    form the new bond
-        probs = np.array([self.bonding_probabilities[bond_type[:-1]] for bond_type in self.current_open_bonds])
+        probs = np.array([bonding_probabilities[bond_type[:-1]] for bond_type in open_bonds])
         probs = probs / sum(probs)
         # 2. pick a random bonding descriptor according to these probs
-        bonding = np.random.choice(list(self.current_open_bonds.keys()), p=probs)
+        bonding = random.choices(list(open_bonds.keys()), weights=probs)[0]
         # 3. get a corresponding node; it may be that one descriptor is found on
         #    several nodes
-        random.seed(a=seed)
-        source_node = random.choice(self.current_open_bonds[bonding])
+        source_node = random.choice(open_bonds[bonding])
         # 4. get the complementary matching bonding descriptor
         compl_bonding = find_complementary_bonding_descriptor(bonding)
         # 5. pick a new fragment that has such bonding descriptor
-        random.seed(a=seed)
-        fragname, target_node = random.choice(self.fragments_by_bonding[compl_bonding])
+        fragname, target_node = random.choice(fragments[compl_bonding])
         # 6. add the new fragment and do some book-keeping
         correspondence = merge_graphs(molecule, self.fragment_dict[fragname])
         molecule.add_edge(source_node,
                           correspondence[target_node],
-                          bonding=(bonding, compl_bonding))
+                          bonding=(bonding, compl_bonding),
+                          order = int(bonding[-1]))
         molecule.nodes[source_node]['bonding'].remove(bonding)
         molecule.nodes[correspondence[target_node]]['bonding'].remove(compl_bonding)
-        self.current_open_bonds = find_open_bonds(molecule)
         return molecule, fragname
 
-    def terminate_branch(self, molecule, fragname, fragid, seed=None):
+    def terminate_fragment(self, molecule, fragid):
+        """
+        If bonding probabilities for terminal residues are given
+        select one terminal to add to the given fragment. If no
+        terminal bonding probabilities are defined the active bonding
+        descriptors of all nodes will be removed.
+
+        Parameters
+        ----------
+        molecule: nx.Graph
+            the molecule graph
+        fragid: int
+            the id of the fragment
+        """
+        target_nodes = [node for node in molecule.nodes if fragid in molecule.nodes[node]['fragid']]
+        open_bonds =  find_open_bonds(molecule, target_nodes=target_nodes)
+        # if terminal fragment bonding probabilities are given, add a terminal fragment here
+        if self.bond_term_probs:
+            self.add_fragment(molecule,
+                              open_bonds,
+                              self.terminals_by_bonding,
+                              self.bond_term_probs)
+            fragid += 1
+
+        for node in target_nodes:
+            if 'bonding' in molecule.nodes[node]:
+                del molecule.nodes[node]['bonding']
+
+        return fragid
+
+    def terminate_branch(self, molecule, fragname, fragid):
         """
         Probabilistically terminate a branch by removing all
         bonding descriptors from the last fragment.
@@ -147,58 +186,59 @@ def terminate_branch(self, molecule, fragname, fragid, seed=None):
         -------
         nx.Graph
         """
-        term_prob = self.termination_probabilities.get(fragname, 0)
-        random.seed(a=seed)
+        term_prob = self.branch_term_probs.get(fragname, -1)
         # probability check for termination
-        if random.random() < term_prob:
+        if random.random() <= term_prob:
             # check if there are more open bonding descriptors
             # if the number is the same as would get removed
             # then we are not on a branch
             active_bonds = nx.get_node_attributes(molecule, 'bonding')
-            target_nodes = [ node for node in active_bonds if molecule.nodes[node]['fragid'] == fragid]
+            target_nodes = [node for node in active_bonds if fragid in molecule.nodes[node]['fragid']]
             if len(target_nodes) < len(active_bonds):
-                for node in target_nodes:
-                    del molecule.nodes[node]['bonding']
-                self.current_open_bonds = find_open_bonds(molecule)
-        return molecule
+                fragid  = self.terminate_fragment(molecule, fragid)
+        return molecule, fragid
 
-    def sample(self, target_weight, seed=None):
+    def sample(self, target_weight, start_fragment=None):
         """
         From a list of cgsmiles fragment graphs generate a new random molecule
         according by stitching them together.
 
         Parameters
         ----------
-        target_weight
+        target_weight: int
             the weight of the polymer to generate
+        start_fragment: str
+            the fragment name to start with
 
         Returns
         -------
         nx.Graph
             the graph of the molecule
         """
         molecule = nx.Graph()
-        if self.start:
-            fragment = self.fragment_dict[self.start]
+        if start_fragment:
+            fragment = self.fragment_dict[start_fragment]
         else:
             # intialize the molecule; all fragements have the same probability
-            random.seed(a=seed)
             fragname = random.choice(list(self.fragment_dict.keys()))
             fragment = self.fragment_dict[fragname]
 
         merge_graphs(molecule, fragment)
-        self.current_open_bonds = find_open_bonds(molecule)
+        open_bonds = find_open_bonds(molecule)
 
         current_weight = 0
 
         # next we add monomers one after the other
         fragid = 1
         while current_weight < target_weight:
-            molecule, fragname = self.grow_chain(molecule, seed=seed)
-            if self.termination_probabilities:
-                molecule = self.terminate_branch(molecule, fragname, fragid, seed=seed)
-            fragid += 1
+            open_bonds = find_open_bonds(molecule)
+            molecule, fragname = self.add_fragment(molecule,
+                                                   open_bonds,
+                                                   self.fragments_by_bonding,
+                                                   self.bonding_probabilities)
+            molecule, fragid = self.terminate_branch(molecule, fragname, fragid)
             current_weight += self.fragment_masses[fragname]
+            fragid += 1
 
         if self.all_atom:
             rebuild_h_atoms(molecule)
@@ -208,8 +248,8 @@ def sample(self, target_weight, seed=None):
 
         # in all-atom MD there are common naming conventions
         # that might be expected and hence we set them here
-#        if self.all_atom:
-#            set_atom_names_atomistic(self.molecule)
+        if self.all_atom:
+            set_atom_names_atomistic(molecule)
 
         return molecule
 
@@ -239,7 +279,6 @@ def from_fragment_string(cls,
         all_atom = kwargs.get('all_atom', True)
         fragment_dict = read_fragments(fragment_strings[0],
                                        all_atom=all_atom)
-
         sampler = cls(fragment_dict,
                       **kwargs)