gruenewald-lab · fgrunewald · May 6, 2024 · Apr 12, 2024 · Apr 12, 2024 · Apr 14, 2024
diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py
@@ -37,13 +37,13 @@ def merge_graphs(source_graph, target_graph, max_node=None):
         # We assume that the last id is always the largest.
         last_node_idx = max_node
         offset = last_node_idx
-        fragment_offset = source_graph.nodes[last_node_idx].get('fragid', 0)
+        fragment_offset = max(source_graph.nodes[last_node_idx].get('fragid', [0])) + 1
 
     correspondence = {}
     for idx, node in enumerate(target_graph.nodes(), start=offset + 1):
         correspondence[node] = idx
         new_atom = copy.copy(target_graph.nodes[node])
-        new_atom['fragid'] = (new_atom.get('fragid', 0) + fragment_offset)
+        new_atom['fragid'] = [(new_atom.get('fragid', 0) + fragment_offset)]
         source_graph.add_node(idx, **new_atom)
 
     for node1, node2 in target_graph.edges:
@@ -53,31 +53,27 @@ def merge_graphs(source_graph, target_graph, max_node=None):
 
     return correspondence
 
-def sort_nodes(graph, sortby_attrs=("fragid", "atomid"), target_attr=None):
+def sort_nodes_by_attr(graph, sort_attr="fragid"):
     """
-    Sort nodes in graph by multiple attributes.
+    Sort nodes in graph by attribute and relable the graph in place.
 
     Parameters
     ----------
     graph: :class:`nx.Graph`
         the graph to sort nodes of
-    sortby_attrs: tuple(str)
-        the attributes and in which order to sort
-    target_attr: `abc.hashable`
-        if not None indices are assigned to this
-        attribute starting at 1
+    sort_attr: `abc.hashable`
+        the attribute to use for sorting
 
     Returns
     -------
     nx.Graph
         graph with nodes sorted in correct order
     """
-    node_order = sorted(graph, key=partial(_keyfunc, graph, attrs=sortby_attrs))
-    for new_idx, node_key in enumerate(node_order, 1):
-#        graph._node.move_to_end(node_key)
-        if target_attr is not None:
-            graph.nodes[node_key][target_attr] = new_idx
-    return graph
+    attr_values = nx.get_node_attributes(graph, sort_attr)
+    sorted_ids = sorted(attr_values, key=lambda item: (attr_values[item], item))
+    mapping = {old: new for new, old in enumerate(sorted_ids)}
+    new_graph = nx.relabel_nodes(graph, mapping, copy=True)
+    return new_graph
 
 def _keyfunc(graph, node_idx, attrs):
     """
@@ -95,23 +91,26 @@ def annotate_fragments(meta_graph, molecule):
     """
     node_to_fragids = nx.get_node_attributes(molecule, 'fragid')
 
-    fragids = defaultdict(list)
-    for fragid, node in node_to_fragids.items():
-        fragids[fragid].append(node)
+    fragid_to_node = defaultdict(list)
+    for node, fragids in node_to_fragids.items():
+        for fragid in fragids:
+            fragid_to_node[fragid].append(node)
 
     for meta_node in meta_graph.nodes:
         # adding node to the fragment graph
         graph_frag = nx.Graph()
-        for node in fragids[meta_node]:
+        for node in fragid_to_node[meta_node]:
             attrs = molecule.nodes[node]
             graph_frag.add_node(node, **attrs)
 
         # adding the edges
         # this is slow but OK; we always assume that the fragment
         # is much much smaller than the fullblown graph
-        combinations = itertools.combinations(fragids[meta_node], r=2)
+        combinations = itertools.combinations(fragid_to_node[meta_node], r=2)
         for a, b in combinations:
             if molecule.has_edge(a, b):
-                graph.add_edge(a, b)
+                graph_frag.add_edge(a, b)
+
+        meta_graph.nodes[meta_node]['graph'] = graph_frag
 
     return meta_graph
diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
@@ -0,0 +1,58 @@
+import pysmiles
+
+VALENCES = pysmiles.smiles_helper.VALENCES
+VALENCES.update({"H": (1,)})
+
+def rebuild_h_atoms(mol_graph, keep_bonding=False):
+    """
+    Helper function which add hydrogen atoms to the molecule graph.
+
+    First the hcount attribute produced by pysmiles is updated, because
+    fragments have no bonds at time of reading so pysmiles does not
+    know the connectivity. Hence the hcount is redone based on the
+    actual connectivity of the final molecule.
+
+    This function also makes sure to tag single hydrogen fragments,
+    in order to not merge them with adjecent fragments. Otherwise,
+    explicit single hydrogen residues would not be possible.
+
+    The molecule graph is updated in place with the hydrogen atoms
+    that are missing.
+
+    Using the keep_bonding argument the hydrogen count is reduced
+    by the number of bonding descriptors. In this way hydrogen
+    atoms can also be added to fragments only.
+
+    Parameters
+    ----------
+    mol_graph: :class:`nx.Graph`
+        graph describing the full molecule without hydrogen atoms
+    """
+    for node in mol_graph.nodes:
+        if mol_graph.nodes[node].get('bonding', False):
+            # get the degree
+            ele = mol_graph.nodes[node]['element']
+            # hcount is the valance minus the degree minus
+            # the number of bonding descriptors
+            bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\
+                               mol_graph.neighbors(node)]))
+            charge = mol_graph.nodes[node].get('charge', 0)
+            hcount = pysmiles.smiles_helper._valence(mol_graph, node, minimum=0) -\
+                     bonds +\
+                     charge
+            # in this case we only rebuild hydrogen atoms that are not
+            # replaced by bonding operators.
+            if keep_bonding:
+                hcount -= len(mol_graph.nodes[node]['bonding'])
+
+            mol_graph.nodes[node]['hcount'] = hcount
+            if ele == "H":
+                mol_graph.nodes[node]['single_h_frag'] = True
+
+    pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)
+    for node in mol_graph.nodes:
+        if mol_graph.nodes[node].get("element", "*") == "H" and\
+        not mol_graph.nodes[node].get("single_h_frag", False):
+            ref_node = next(mol_graph.neighbors(node))
+            mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
+            mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]
diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
@@ -6,6 +6,35 @@
 import pysmiles
 from .read_cgsmiles import read_cgsmiles
 
+class PeekIter(object):
+    """
+    Custom iter that allows looking ahead, without
+    advancing the actual iter.
+    """
+    def __init__(self, collection):
+        self.collection = collection
+        self.index = 0
+
+    def __next__(self):
+        try:
+            result = self.collection[self.index]
+            self.index += 1
+        except IndexError:
+            raise StopIteration
+        return result
+
+    def peek(self):
+        try:
+            result = self.collection[self.index]
+        except IndexError:
+            return ""
+        return result
+
+    def __iter__(self):
+        self.index = 0
+        return self
+
+
 def strip_bonding_descriptors(fragment_string):
     """
     Processes a CGBigSmile fragment string by
@@ -27,21 +56,30 @@ def strip_bonding_descriptors(fragment_string):
         a dict mapping bonding descriptors
         to the nodes within the string
     """
-    smile_iter = iter(fragment_string)
+    bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
+    smile_iter = PeekIter(fragment_string)
     bonding_descrpt = defaultdict(list)
     smile = ""
     node_count = 0
     prev_node = 0
+    current_order = None
     for token in smile_iter:
         if token == '[':
             peek = next(smile_iter)
-            if peek in ['$', '>', '<']:
+            if peek in ['$', '>', '<', '!']:
                 bond_descrp = peek
                 peek = next(smile_iter)
                 while peek != ']':
                     bond_descrp += peek
                     peek = next(smile_iter)
-                bonding_descrpt[prev_node].append(bond_descrp)
+                if smile_iter.peek() in bond_to_order:
+                    order = bond_to_order[next(smile_iter)]
+                elif current_order:
+                    order = current_order
+                    current_order = None
+                else:
+                    order = 1
+                bonding_descrpt[prev_node].append(bond_descrp + str(order))
             else:
                 atom = token
                 while peek != ']':
@@ -59,47 +97,16 @@ def strip_bonding_descriptors(fragment_string):
         elif token == ')':
             prev_node = anchor
             smile += token
+        elif token in bond_to_order:
+            current_order = bond_to_order[token]
         else:
-            if token not in '] H @ . - = # $ : / \\ + - %'\
+            if token not in '] H @ $ / \\ + - %'\
                 and not token.isdigit():
                 prev_node = node_count
                 node_count += 1
             smile += token
     return smile, bonding_descrpt
 
-def _rebuild_h_atoms(mol_graph):
-    # special hack around to fix
-    # pysmiles bug for a single
-    # atom molecule; we assume that the
-    # hcount is just wrong and set it to
-    # the valance number minus bonds minus
-    # bonding connectors
-    if len(mol_graph.nodes) == 1:
-        ele = mol_graph.nodes[0]['element']
-        # for N and P we assume the regular valency
-        hcount = pysmiles.smiles_helper.VALENCES[ele][0]
-        if mol_graph.nodes[0].get('bonding', False):
-            hcount -= 1
-        mol_graph.nodes[0]['hcount'] = hcount
-    else:
-        for node in mol_graph.nodes:
-            if mol_graph.nodes[node].get('bonding', False):
-                # get the degree
-                ele = mol_graph.nodes[node]['element']
-                # hcount is the valance minus the degree minus
-                # the number of bonding descriptors
-                bonds = round(sum([mol_graph.edges[(node, neigh)]['order'] for neigh in\
-                                   mol_graph.neighbors(node)]))
-                charge = mol_graph.nodes[node].get('charge', 0)
-                hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\
-                         bonds -\
-                         len(mol_graph.nodes[node]['bonding']) +\
-                         charge
-                mol_graph.nodes[node]['hcount'] = hcount
-
-    pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)
-    return mol_graph
-
 def fragment_iter(fragment_str, all_atom=True):
     """
     Iterates over fragments defined in a CGBigSmile string.
@@ -137,8 +144,6 @@ def fragment_iter(fragment_str, all_atom=True):
         elif all_atom:
             mol_graph = pysmiles.read_smiles(smile)
             nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
-            # we need to rebuild hydrogen atoms now
-            _rebuild_h_atoms(mol_graph)
         # we deal with a CG resolution graph
         else:
             mol_graph = read_cgsmiles(smile)