gruenewald-lab · fgrunewald · Nov 22, 2024 · Oct 14, 2024 · Oct 14, 2024 · Oct 14, 2024
diff --git a/cgsmiles/cgsmiles_utils.py b/cgsmiles/cgsmiles_utils.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 import networkx as nx
+from .read_cgsmiles import read_cgsmiles
 
 def find_complementary_bonding_descriptor(bonding_descriptor, ellegible_descriptors=None):
     """
@@ -64,3 +65,35 @@ def find_open_bonds(molecule, target_nodes=None):
             for bonding_types in bonding_types:
                 open_bonds_by_descriptor[bonding_types].append(node)
     return open_bonds_by_descriptor
+
+def read_fragment_cgsmiles(cgsmiles_str,
+                           fragname,
+                           bonding_descrpt=None,
+                           attributes=None):
+    """
+    Read a cgsmiles_str corresponding to a CGSmiles fragment and
+    annotate bonding descriptors as well as any other attributes.
+
+    The node attributes are set in the following order, so later
+    entries overwrite earlier ones:
+
+    - atomname to the fragname read from `cgsmiles_str`
+    - bonding to the entries of `bonding_descrpt`
+    - fragname to `fragname`
+    - fragid to 0
+    - w to 1
+    - everything given in `attributes`
+
+    Parameters
+    ----------
+    cgsmiles_str: str
+        string in CGSmiles format
+    fragname: str
+        the name of the fragment
+    bonding_descrpt: dict
+        maps node keys to the value stored under the 'bonding'
+        attribute; if None no bonding descriptors are set
+    attributes: dict
+        maps node keys to a dict of attributes that is applied
+        last; if None no further attributes are set
+
+    Returns
+    -------
+    nx.Graph
+        the graph of the molecular fragment
+    """
+    # None replaces the former mutable `{}` defaults
+    bonding_descrpt = {} if bonding_descrpt is None else bonding_descrpt
+    attributes = {} if attributes is None else attributes
+
+    mol_graph = read_cgsmiles(cgsmiles_str)
+    # the names read from the string become the atomnames, because
+    # 'fragname' is overwritten with the fragment name below
+    fragnames = nx.get_node_attributes(mol_graph, 'fragname')
+    nx.set_node_attributes(mol_graph, fragnames, 'atomname')
+    nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
+    nx.set_node_attributes(mol_graph, fragname, 'fragname')
+    nx.set_node_attributes(mol_graph, 0, 'fragid')
+    # NOTE(review): read_fragment_smiles stores this default under
+    # 'weight' instead of 'w' - confirm which key is intended here
+    nx.set_node_attributes(mol_graph, 1, 'w')
+    # user supplied attributes go last so they win over the defaults
+    nx.set_node_attributes(mol_graph, attributes)
+    return mol_graph
diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py
@@ -0,0 +1,147 @@
+from inspect import signature, Signature, Parameter
+from functools import partial
+
+def check_and_cast_types(bound_args, signature):
+    """
+    Cast the bound arguments to the types annotated in the signature.
+
+    Arguments whose parameter has no annotation, or whose value is
+    already an instance of the annotated type, are left untouched.
+    All other values are replaced by `annotation(value)`.
+
+    Parameters
+    ----------
+    bound_args: inspect.BoundArguments
+        the arguments bound to `signature`; modified in place
+    signature: inspect.Signature
+        the signature providing the type annotations
+
+    Returns
+    -------
+    inspect.BoundArguments
+        the `bound_args` object that was passed in
+
+    Raises
+    ------
+    TypeError
+        if a value cannot be cast to the annotated type
+    """
+    for name, value in bound_args.arguments.items():
+        param = signature.parameters.get(name)
+        # nothing to check without a type annotation
+        if param is None or param.annotation is Parameter.empty:
+            continue
+        expected_type = param.annotation
+        if isinstance(value, expected_type):
+            continue
+        # only existing keys are reassigned, which is safe while
+        # iterating over the mapping
+        try:
+            bound_args.arguments[name] = expected_type(value)
+        except (TypeError, ValueError) as err:
+            msg = f"Argument '{name}' must be of type {expected_type.__name__}"
+            # keep the failed conversion as cause for debugging
+            raise TypeError(msg) from err
+    return bound_args
+
+def _parse_dialect_string(string_iterable,
+                          dialect_signature,
+                          arg_to_fullname=None,
+                          annotation_sep_token=';',
+                          annotation_assign_token='='):
+    """
+    This base function parses a string that describes key value pairs
+    having a pattern of:
+
+    key<annotation_assign_token>value<annotation_sep_token>key ...
+
+    Default values, non-keyword arguments and types are defined using the
+    dialect signature object. If args are defined the key and assignment
+    token may be omitted.
+
+    Neither the `annotation_sep_token` nor the `annotation_assign_token`
+    can be part of key or value. A SyntaxError is raised in this case.
+
+    Parameters
+    ----------
+    string_iterable: str
+        the string that contains the annotations; an empty string
+        yields only the default values
+    dialect_signature: inspect.Signature
+        a signature defining args, kwargs, default values
+        and types
+    arg_to_fullname: dict
+        maps arguments to more verbose descriptions; if None no
+        argument is renamed
+    annotation_sep_token: str
+        character used to separate key value pairs
+    annotation_assign_token: str
+        character used to separate a key from its value
+
+    Returns
+    -------
+    dict
+       dict of key value pairs
+
+    Raises
+    ------
+    SyntaxError
+        an error is raised if the signature does not match or
+        too many annotation_assign_token are given
+    TypeError
+        if a value cannot be cast to the type defined in the
+        signature
+    """
+    # None replaces the former mutable `{}` default
+    if arg_to_fullname is None:
+        arg_to_fullname = {}
+
+    args_found = []
+    kwargs_found = {}
+    if string_iterable:
+        for entry in string_iterable.split(annotation_sep_token):
+            # count the configured token and not a hard-coded '=';
+            # otherwise surplus values would be dropped silently
+            # for dialects using a different assignment token
+            if entry.count(annotation_assign_token) > 1:
+                msg = (f"Your annotation {entry} contains too many "
+                       f"{annotation_assign_token} characters. Only "
+                       "one character per key value pair is allowed.")
+                raise SyntaxError(msg)
+            key_value = entry.split(annotation_assign_token)
+
+            # entries without assignment token are positional
+            if len(key_value) == 1:
+                args_found.append(key_value[0])
+            else:
+                kwargs_found[key_value[0]] = key_value[1]
+
+    try:
+        applied_labels = dialect_signature.bind(*args_found,
+                                                **kwargs_found)
+    except TypeError as emsg:
+        msg = ("You have too many positional arguments or "
+               f"{annotation_sep_token} as part of key value "
+                "pairs which is not allowed.")
+        # the original binding error is kept as the cause instead
+        # of being printed to stdout
+        raise SyntaxError(msg) from emsg
+
+    applied_labels = check_and_cast_types(applied_labels,
+                                          dialect_signature)
+    applied_labels.apply_defaults()
+    # convert keys to more verbose names
+    # this should only apply to args known to
+    # the signature
+    for old_key, new_key in arg_to_fullname.items():
+        if old_key in applied_labels.arguments:
+            applied_labels.arguments[new_key] = applied_labels.arguments.pop(old_key)
+
+    # if there are kwargs we need to put them into
+    # output dict
+    out_args = {}
+    if 'kwargs' in applied_labels.arguments:
+        out_args.update(applied_labels.arguments['kwargs'])
+        del applied_labels.arguments['kwargs']
+    out_args.update(applied_labels.arguments)
+    return out_args
+
+def create_dialect(default_attributes, accept_kwargs=True):
+    """
+    Build the signature describing a dialect from its defaults.
+
+    Every entry of `default_attributes` becomes a positional or
+    keyword parameter whose default is the given value and whose
+    annotation is the type of that value. The parameters follow
+    the order of the dict, which therefore fixes the order of the
+    accepted positional arguments. If `accept_kwargs` is True a
+    variable keyword parameter named 'kwargs' is appended.
+    """
+    params = [Parameter(name,
+                        Parameter.POSITIONAL_OR_KEYWORD,
+                        default=default,
+                        annotation=type(default))
+              for name, default in default_attributes.items()]
+    if accept_kwargs:
+        catch_all = Parameter('kwargs', kind=Parameter.VAR_KEYWORD)
+        params.append(catch_all)
+    return Signature(params)
+
+##########################################################
+#                   KNOWN DIALECTS                       #
+##########################################################
+# This one is for global use; it is the base CGSmiles dialect.
+# It accepts fragname, q and w in this order as positional
+# arguments plus arbitrary further keyword annotations.
+CGSMILES_DEFAULT_DIALECT = create_dialect({"fragname": "NaN",
+                                           "q": 0.0,
+                                           "w": 1.0})
+# parser for the base dialect; the short keys w and q are
+# returned under the more verbose names weight and charge
+parse_graph_base_node = partial(_parse_dialect_string,
+                                dialect_signature=CGSMILES_DEFAULT_DIALECT,
+                                arg_to_fullname = {"w": "weight", "q": "charge"})
+# This one is an internal workaround until the pysmiles
+# base parser is available.
+# It just strips the kwargs from fragments before
+# they go to the respective parser.
+# In case of cgsmiles fragments this does part of
+# the work twice.
+fragment_base = create_dialect({"w": 1.0}, accept_kwargs=True)
+_fragment_node_parser = partial(_parse_dialect_string,
+                                dialect_signature=fragment_base,
+                                arg_to_fullname = {"w": "weight"})
diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py
@@ -146,7 +146,6 @@ def annotate_fragments(meta_graph, molecule):
 
     return meta_graph
 
-
 def set_atom_names_atomistic(molecule, meta_graph=None):
     """
     Set atomnames according to commonly used convention

diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
@@ -32,7 +32,9 @@ def compute_mass(input_molecule):
         mass += pysmiles.PTE[element]['AtomicMass']
     return mass
 
-def rebuild_h_atoms(mol_graph, keep_bonding=False):
+def rebuild_h_atoms(mol_graph,
+                    keep_bonding=False,
+                    copy_attrs=['fragid', 'fragname', 'weight']):
     """
     Helper function which add hydrogen atoms to the molecule graph.
 
@@ -52,17 +54,22 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
     by the number of bonding descriptors. In this way hydrogen
     atoms can also be added to fragments only.
 
+    The `copy_attrs` argument defines a list of attributes to copy
+    to the newly added hydrogen atoms. In case the hydrogen atoms
+    are their own fragments attributes are not copied. If an attribute
+    is already assigned, because the hydrogen atom was explicit that
+    attribute is not replaced.
+
     Parameters
     ----------
     mol_graph: :class:`nx.Graph`
         graph describing the full molecule without hydrogen atoms
+    copy_attrs: list[abc.hashable]
+        a list of attributes to copy from the parent node to the
+        hydrogen atom
+    keep_bonding: bool
+        adjust hcount for number of bonding descriptors
     """
-    for node in mol_graph.nodes:
-
-        if mol_graph.nodes[node].get('bonding', False) and  \
-            mol_graph.nodes[node].get('element', '*') == "H":
-            mol_graph.nodes[node]['single_h_frag'] = True
-
     try:
         pysmiles.smiles_helper.correct_aromatic_rings(mol_graph, strict=True)
     except SyntaxError as pysmiles_err:
@@ -89,14 +96,14 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
     # now we add the hydrogen atoms
     pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)
 
-    # if we are having single hydrogen fragments we need to
-    # make sure the fragid and fragname is keept
-    for node in mol_graph.nodes:
-        if mol_graph.nodes[node].get("element", "*") == "H" and\
-        not mol_graph.nodes[node].get("single_h_frag", False):
-            ref_node = next(mol_graph.neighbors(node))
-            mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
-            mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]
+    for node, element in mol_graph.nodes(data='element'):
+        if element == "H" and not mol_graph.nodes[node].get("single_h_frag", False):
+            anchor = next(mol_graph.neighbors(node))
+            for attr in copy_attrs:
+                if attr in mol_graph.nodes[node]:
+                    continue
+                value = mol_graph.nodes[anchor][attr]
+                mol_graph.nodes[node][attr] = value
 
 def annotate_ez_isomers(molecule):
     """
@@ -179,3 +186,97 @@ def mark_chiral_atoms(molecule):
             neighbours = [neighbours[0],  neighbours[1], neighbours[3], neighbours[2]]
 
         molecule.nodes[node]['rs_isomer'] = tuple(neighbours)
+
+def read_fragment_smiles(smiles_str,
+                         fragname,
+                         bonding_descrpt=None,
+                         rs_isomers=None,
+                         ez_isomers=None,
+                         attributes=None):
+    """
+    Read a smiles_str corresponding to a CGSmiles fragment and
+    annotate bonding descriptors, isomers, as well as any other
+    attributes.
+
+    This function also sets default attributes as follows:
+
+    - fragname to `fragname`
+    - fragid to 0
+    - weight to 1
+    - atomname to the element followed by the node key
+
+    Explicit hydrogen atoms are removed again unless their node key
+    is part of `attributes`. A fragment that consists of a single
+    atom is returned early, that is without hydrogen removal and
+    without isomer annotation.
+
+    Parameters
+    ----------
+    smiles_str: str
+        string in OpenSMILES format
+    fragname: str
+        the name of the fragment
+    bonding_descrpt: dict
+        maps node keys to the value stored under the 'bonding'
+        attribute; if None no bonding descriptors are set
+    rs_isomers: dict
+        maps node keys to the value stored under the 'rs_isomer'
+        attribute; if None nothing is annotated
+    ez_isomers: dict
+        maps node keys to a sequence of which the last item is
+        stored as 'ez_isomer_class' and all items before it as
+        'ez_isomer_atoms'; if None nothing is annotated
+    attributes: dict
+        maps node keys to a dict of further attributes; if None
+        no further attributes are set
+
+    Returns
+    -------
+    nx.Graph
+        the graph of the molecular fragment
+    """
+    # None replaces the former mutable `{}` defaults
+    bonding_descrpt = {} if bonding_descrpt is None else bonding_descrpt
+    rs_isomers = {} if rs_isomers is None else rs_isomers
+    ez_isomers = {} if ez_isomers is None else ez_isomers
+    attributes = {} if attributes is None else attributes
+
+    if smiles_str == 'H':
+        LOGGER.warning("You define an H fragment, which is not valid SMILES. We'll make it [H].")
+        smiles_str = '[H]'
+
+    mol_graph = pysmiles.read_smiles(smiles_str,
+                                     explicit_hydrogen=True,
+                                     reinterpret_aromatic=False,
+                                     strict=False)
+    # set some default values
+    nx.set_node_attributes(mol_graph, fragname, 'fragname')
+    nx.set_node_attributes(mol_graph, 0, 'fragid')
+    nx.set_node_attributes(mol_graph, 1, 'weight')
+
+    # we add all bonding descriptors to the molecule
+    nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
+
+    # set other attributes; they overwrite the defaults above
+    nx.set_node_attributes(mol_graph, attributes)
+
+    # set the default atomnames consisting of the element and index
+    atomnames = {node: f"{element}{node}"
+                 for node, element in mol_graph.nodes(data='element')}
+    nx.set_node_attributes(mol_graph, atomnames, 'atomname')
+
+    # we have just a single atom so no need for any annotations
+    if len(mol_graph) == 1:
+        # we set the hcount for all non-hydrogen elements
+        if mol_graph.nodes[0]['element'] != 'H':
+            mol_graph.nodes[0]['hcount'] = 0
+        # we tag all single h-atoms
+        else:
+            mol_graph.nodes[0]['single_h_frag'] = True
+        return mol_graph
+
+    # we need to remove hydrogen atoms except when they are having
+    # attributes; in this case we need to keep them
+    hatoms = {n for n, e in mol_graph.nodes(data='element') if e == 'H'}
+    hatoms_to_keep = set(attributes) & hatoms
+
+    # temp fix until pysmiles util is improved
+    # we set the element to z so they are ignored when pysmiles removes hatoms
+    nx.set_node_attributes(mol_graph,
+                           {node: 'z' for node in hatoms_to_keep},
+                           'element')
+
+    pysmiles.remove_explicit_hydrogens(mol_graph)
+
+    # now we reset the hatoms
+    nx.set_node_attributes(mol_graph,
+                           {node: 'H' for node in hatoms_to_keep},
+                           'element')
+
+    # annotate rs isomers
+    nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer')
+
+    # we need to split countable node keys and the associated value
+    ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()}
+    ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()}
+    nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms')
+    nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class')
+
+    return mol_graph
-Original file line number
+Diff line change
@@ Expand Up / @@ -146,7 +146,6 @@ def annotate_fragments(meta_graph, molecule): @@
         return meta_graph
     def set_atom_names_atomistic(molecule, meta_graph=None):
         """
         Set atomnames according to commonly used convention
@@ Expand Down @@