Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Annotations #25

Merged
merged 19 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions cgsmiles/cgsmiles_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections import defaultdict
import networkx as nx
from .read_cgsmiles import read_cgsmiles

def find_complementary_bonding_descriptor(bonding_descriptor, ellegible_descriptors=None):
"""
Expand Down Expand Up @@ -64,3 +65,35 @@ def find_open_bonds(molecule, target_nodes=None):
for bonding_types in bonding_types:
open_bonds_by_descriptor[bonding_types].append(node)
return open_bonds_by_descriptor

def read_fragment_cgsmiles(cgsmiles_str,
                           fragname,
                           bonding_descrpt=None,
                           attributes=None):
    """
    Read a CGSmiles string corresponding to a fragment and annotate
    bonding descriptors as well as any other attributes.

    The node attributes are written in the following order, so an
    entry in `attributes` overrides any of the defaults set before:

    - atomname: the 'fragname' each node carries in the graph
      returned by `read_cgsmiles`
    - bonding: taken from `bonding_descrpt`
    - fragname: set to `fragname`
    - fragid: set to 0
    - w: set to 1
    - everything listed in `attributes`

    Parameters
    ----------
    cgsmiles_str: str
        string in CGSmiles format
    fragname: str
        the name of the fragment
    bonding_descrpt: dict, optional
        maps node keys to the value stored under 'bonding';
        presumably the bonding descriptors of that node
    attributes: dict, optional
        dict of dicts mapping node keys to the attribute/value
        pairs to set on that node

    Returns
    -------
    nx.Graph
        the graph of the molecular fragment
    """
    # None replaces the former mutable `{}` defaults so that no dict
    # object is shared between calls
    if bonding_descrpt is None:
        bonding_descrpt = {}
    if attributes is None:
        attributes = {}

    mol_graph = read_cgsmiles(cgsmiles_str)

    # the names parsed from the string become the atomnames; this has
    # to happen before 'fragname' is overwritten below
    parsed_names = nx.get_node_attributes(mol_graph, 'fragname')
    nx.set_node_attributes(mol_graph, parsed_names, 'atomname')

    nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')

    # set some default values
    nx.set_node_attributes(mol_graph, fragname, 'fragname')
    nx.set_node_attributes(mol_graph, 0, 'fragid')
    # NOTE(review): the default weight is stored under 'w' here while
    # read_fragment_smiles stores it under 'weight' - confirm which
    # key is intended
    nx.set_node_attributes(mol_graph, 1, 'w')

    # user supplied attributes go last so they take precedence
    nx.set_node_attributes(mol_graph, attributes)
    return mol_graph
147 changes: 147 additions & 0 deletions cgsmiles/dialects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
from inspect import signature, Signature, Parameter
from functools import partial

def check_and_cast_types(bound_args, signature):
    """
    Cast the values of bound arguments to the types annotated in
    the signature.

    A value is only converted if the matching parameter carries an
    annotation and the value is not already an instance of it. The
    conversion is done by calling the annotation with the value.

    Parameters
    ----------
    bound_args: inspect.BoundArguments
        the arguments to check; modified in place
    signature: inspect.Signature
        the signature providing the annotations

    Returns
    -------
    inspect.BoundArguments
        the same `bound_args` object that was passed in

    Raises
    ------
    TypeError
        if a value cannot be converted to the annotated type; the
        original conversion error is attached as the cause
    """
    # iterate over a snapshot as values are replaced inside the loop
    for name, value in list(bound_args.arguments.items()):
        param = signature.parameters.get(name)
        # nothing to check without a parameter or an annotation
        if param is None or param.annotation is Parameter.empty:
            continue

        expected_type = param.annotation
        if isinstance(value, expected_type):
            continue

        # attempt type casting as the value is not of the expected type
        try:
            bound_args.arguments[name] = expected_type(value)
        except (TypeError, ValueError) as err:
            msg = f"Argument '{name}' must be of type {expected_type.__name__}"
            raise TypeError(msg) from err
    return bound_args

def _parse_dialect_string(string_iterable,
                          dialect_signature,
                          arg_to_fullname=None,
                          annotation_sep_token=';',
                          annotation_assign_token='='):
    """
    This base function parses a string that describes key value pairs
    having a pattern of:

        key<annotation_assign_token>value<annotation_sep_token>key ...

    Default values, non-keyword arguments and types are defined using the
    dialect signature object. If args are defined the key and assignment
    token may be omitted.

    Neither the `annotation_sep_token` nor the `annotation_assign_token`
    can be part of key or value. A SyntaxError is raised in this case.

    Parameters
    ----------
    string_iterable: str
        the string to parse; an empty string yields only the defaults
    dialect_signature: inspect.Signature
        a signature defining args, kwargs, default values
        and types
    arg_to_fullname: dict, optional
        maps argument names to more verbose names used as keys
        in the returned dict
    annotation_sep_token: str
        character used to separate key value pairs
    annotation_assign_token: str
        character used to assign a value to a key

    Returns
    -------
    dict
        dict of key value pairs

    Raises
    ------
    SyntaxError
        an error is raised if the signature does not match or
        too many annotation_assign_token are given
    TypeError
        raised by `check_and_cast_types` if a value cannot be
        converted to the type defined by the signature
    """
    # None replaces the former mutable `{}` default
    if arg_to_fullname is None:
        arg_to_fullname = {}

    args_found = []
    kwargs_found = {}
    if string_iterable:
        for entry in string_iterable.split(annotation_sep_token):
            # more than one assignment token cannot be resolved into
            # a single key value pair; the configured token has to be
            # counted here and not a hard-coded '='
            if entry.count(annotation_assign_token) > 1:
                msg = (f"Your annotation {entry} contains too many "
                       f"{annotation_assign_token} characters. Only "
                       "one character per key value pair is allowed.")
                raise SyntaxError(msg)

            key, assign_token, value = entry.partition(annotation_assign_token)
            if assign_token:
                kwargs_found[key] = value
            else:
                # no assignment token means a positional argument
                args_found.append(key)

    try:
        applied_labels = dialect_signature.bind(*args_found,
                                                **kwargs_found)
    except TypeError as err:
        msg = ("You have too many positional arguments or "
               f"{annotation_sep_token} as part of key value "
               "pairs which is not allowed.")
        # chaining keeps the detailed bind error in the traceback
        raise SyntaxError(msg) from err

    applied_labels = check_and_cast_types(applied_labels,
                                          dialect_signature)
    applied_labels.apply_defaults()
    arguments = applied_labels.arguments

    # convert keys to more verbose names
    # this should only apply to args known to
    # the signature
    for old_key, new_key in arg_to_fullname.items():
        if old_key in arguments:
            arguments[new_key] = arguments.pop(old_key)

    # if there are kwargs we need to put them into the
    # output dict; the named arguments are added afterwards
    # so they win if a key appears in both
    out_args = {}
    if 'kwargs' in arguments:
        out_args.update(arguments.pop('kwargs'))
    out_args.update(arguments)
    return out_args

def create_dialect(default_attributes, accept_kwargs=True):
    """
    Build the signature describing a dialect from its default
    annotations.

    Every entry of `default_attributes` becomes a positional or
    keyword parameter with the value as default and the type of
    the value as annotation. The order of the entries in the dict
    determines the order of the args accepted.

    Parameters
    ----------
    default_attributes: dict
        maps argument names to their default values
    accept_kwargs: bool
        if True a variable keyword parameter named 'kwargs'
        is appended to the signature

    Returns
    -------
    inspect.Signature
        the signature of the dialect
    """
    dialect_params = [
        Parameter(name,
                  Parameter.POSITIONAL_OR_KEYWORD,
                  default=default,
                  annotation=type(default))
        for name, default in default_attributes.items()
    ]
    if accept_kwargs:
        dialect_params += [Parameter('kwargs', kind=Parameter.VAR_KEYWORD)]
    return Signature(dialect_params)

##########################################################
#                    KNOWN DIALECTS                      #
##########################################################
# Base CGSmiles dialect, meant for global use. It knows the
# arguments fragname, q and w (in that positional order) and
# accepts additional keyword annotations.
CGSMILES_DEFAULT_DIALECT = create_dialect({"fragname": "NaN",
                                           "q": 0.0,
                                           "w": 1.0})
# Parser for the base dialect; in the returned dict the short
# keys w and q are replaced by weight and charge.
parse_graph_base_node = partial(_parse_dialect_string,
                                dialect_signature=CGSMILES_DEFAULT_DIALECT,
                                arg_to_fullname = {"w": "weight", "q": "charge"})
# Internal workaround until the pysmiles base parser is
# available. It just strips the kwargs from fragments before
# they go to the respective parser. In case of cgsmiles
# fragments this does part of the work twice.
fragment_base = create_dialect({"w": 1.0}, accept_kwargs=True)
# Parser for the fragment dialect; w is returned as weight.
_fragment_node_parser = partial(_parse_dialect_string,
                                dialect_signature=fragment_base,
                                arg_to_fullname = {"w": "weight"})
1 change: 0 additions & 1 deletion cgsmiles/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ def annotate_fragments(meta_graph, molecule):

return meta_graph


def set_atom_names_atomistic(molecule, meta_graph=None):
"""
Set atomnames according to commonly used convention
Expand Down
131 changes: 116 additions & 15 deletions cgsmiles/pysmiles_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def compute_mass(input_molecule):
mass += pysmiles.PTE[element]['AtomicMass']
return mass

def rebuild_h_atoms(mol_graph, keep_bonding=False):
def rebuild_h_atoms(mol_graph,
keep_bonding=False,
copy_attrs=['fragid', 'fragname', 'weight']):
"""
Helper function which add hydrogen atoms to the molecule graph.

Expand All @@ -52,17 +54,22 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
by the number of bonding descriptors. In this way hydrogen
atoms can also be added to fragments only.

The `copy_attrs` argument defines a list of attributes to copy
to the newly added hydrogen atoms. In case the hydrogen atoms
are their own fragments attributes are not copied. If an attribute
is already assigned, because the hydrogen atom was explicit that
attribute is not replaced.

Parameters
----------
mol_graph: :class:`nx.Graph`
graph describing the full molecule without hydrogen atoms
copy_attrs: list[abc.hashable]
a list of attributes to copy from the parent node to the
hydrogen atom
keep_bonding: bool
adjust hcount for number of bonding descriptors
"""
for node in mol_graph.nodes:

if mol_graph.nodes[node].get('bonding', False) and \
mol_graph.nodes[node].get('element', '*') == "H":
mol_graph.nodes[node]['single_h_frag'] = True

try:
pysmiles.smiles_helper.correct_aromatic_rings(mol_graph, strict=True)
except SyntaxError as pysmiles_err:
Expand All @@ -89,14 +96,14 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
# now we add the hydrogen atoms
pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)

# if we are having single hydrogen fragments we need to
# make sure the fragid and fragname is keept
for node in mol_graph.nodes:
if mol_graph.nodes[node].get("element", "*") == "H" and\
not mol_graph.nodes[node].get("single_h_frag", False):
ref_node = next(mol_graph.neighbors(node))
mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]
for node, element in mol_graph.nodes(data='element'):
if element == "H" and not mol_graph.nodes[node].get("single_h_frag", False):
anchor = next(mol_graph.neighbors(node))
for attr in copy_attrs:
if attr in mol_graph.nodes[node]:
continue
value = mol_graph.nodes[anchor][attr]
mol_graph.nodes[node][attr] = value

def annotate_ez_isomers(molecule):
"""
Expand Down Expand Up @@ -179,3 +186,97 @@ def mark_chiral_atoms(molecule):
neighbours = [neighbours[0], neighbours[1], neighbours[3], neighbours[2]]

molecule.nodes[node]['rs_isomer'] = tuple(neighbours)

def read_fragment_smiles(smiles_str,
                         fragname,
                         bonding_descrpt=None,
                         rs_isomers=None,
                         ez_isomers=None,
                         attributes=None):
    """
    Read a smiles_str corresponding to a CGSmiles fragment and
    annotate bonding descriptors, isomers, as well as any other
    attributes.

    This function also sets default attributes as follows:

    - fragname to `fragname`
    - fragid to 0
    - weight to 1
    - atomname to the element followed by the node key

    Hydrogen atoms are removed from fragments with more than one
    atom unless they are listed in `attributes`.

    Parameters
    ----------
    smiles_str: str
        string in OpenSMILES format
    fragname: str
        the name of the fragment
    bonding_descrpt: dict, optional
        maps node keys to the value stored under 'bonding'
    rs_isomers: dict, optional
        maps node keys to the value stored under 'rs_isomer'
    ez_isomers: dict, optional
        maps node keys to a sequence; the last element is stored
        under 'ez_isomer_class' and all elements before it under
        'ez_isomer_atoms'
    attributes: dict, optional
        dict of dicts mapping node keys to the attribute/value
        pairs to set on that node

    Returns
    -------
    nx.Graph
        the graph of the molecular fragment
    """
    # None replaces the former mutable `{}` defaults so that no dict
    # object is shared between calls
    bonding_descrpt = {} if bonding_descrpt is None else bonding_descrpt
    rs_isomers = {} if rs_isomers is None else rs_isomers
    ez_isomers = {} if ez_isomers is None else ez_isomers
    attributes = {} if attributes is None else attributes

    if smiles_str == 'H':
        LOGGER.warning("You define an H fragment, which is not valid SMILES. We'll make it [H].")
        smiles_str = '[H]'

    mol_graph = pysmiles.read_smiles(smiles_str,
                                     explicit_hydrogen=True,
                                     reinterpret_aromatic=False,
                                     strict=False)
    # set some default values
    nx.set_node_attributes(mol_graph, fragname, 'fragname')
    nx.set_node_attributes(mol_graph, 0, 'fragid')
    nx.set_node_attributes(mol_graph, 1, 'weight')

    # we add all bonding descriptors to the molecule
    nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')

    # set other attributes
    nx.set_node_attributes(mol_graph, attributes)

    # set the default atomnames consisting of the element and index
    atomnames = {node: data['element'] + str(node)
                 for node, data in mol_graph.nodes(data=True)}
    nx.set_node_attributes(mol_graph, atomnames, 'atomname')

    # we have just a single atom so no need for any annotations
    if len(mol_graph) == 1:
        # we set the hcount for all non-hydrogen elements
        if mol_graph.nodes[0]['element'] != 'H':
            mol_graph.nodes[0]['hcount'] = 0
        # we tag all single h-atoms
        else:
            mol_graph.nodes[0]['single_h_frag'] = True
        return mol_graph

    # we need to remove hydrogen atoms except when they are having
    # attributes; in this case we need to keep them
    hatoms = {node for node, element in mol_graph.nodes(data='element')
              if element == 'H'}
    hatoms_to_keep = set(attributes) & hatoms

    # temp fix until pysmiles util is improved
    # we set the element to z so they are ignored when pysmiles removes hatoms
    nx.set_node_attributes(mol_graph,
                           dict.fromkeys(hatoms_to_keep, 'z'),
                           'element')

    pysmiles.remove_explicit_hydrogens(mol_graph)

    # now we reset the hatoms
    nx.set_node_attributes(mol_graph,
                           dict.fromkeys(hatoms_to_keep, 'H'),
                           'element')

    # annotate rs isomers
    nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer')

    # we need to split countable node keys and the associated value
    ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()}
    ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()}
    nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms')
    nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class')

    return mol_graph
Loading
Loading