-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from gruenewald-lab/write_cgsmiles
[5] Writer
- Loading branch information
Showing
5 changed files
with
322 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import pytest | ||
import networkx as nx | ||
from pysmiles.testhelper import assertEqualGraphs | ||
from cgsmiles.read_fragments import read_fragments | ||
from cgsmiles.read_cgsmiles import read_cgsmiles | ||
from cgsmiles.write_cgsmiles import (write_cgsmiles_fragments, | ||
write_cgsmiles_graph, | ||
write_cgsmiles) | ||
from cgsmiles import MoleculeResolver | ||
|
||
@pytest.mark.parametrize('input_string',( | ||
# smiple linear seqeunce | ||
"{#PEO=[$]COC[$],#OHter=[$]O}", | ||
# two bonding IDs | ||
"{#PEO=[$][$A]COC[$][$B],#OHter=[$]O}", | ||
# something with bond order | ||
"{#PEO=[$]=COC[$A],#OHter=[$A]O,#PI=[$]=C}", | ||
# something with a shash operator | ||
"{#TC5=[!]CCC[!],#TN6a=[!]CNC[!]}", | ||
# something with aromatic fragments | ||
"{#TC5=[!]ccc[!],#TN6a=[!]cnc[!]}", | ||
)) | ||
def test_write_fragments(input_string): | ||
frag_dict = read_fragments(input_string) | ||
out_string = write_cgsmiles_fragments(frag_dict, smiles_format=True) | ||
frag_dict_out = read_fragments(out_string) | ||
assert set(frag_dict_out) == set(frag_dict) | ||
for fragname in frag_dict: | ||
assertEqualGraphs(frag_dict_out[fragname], frag_dict[fragname]) | ||
|
||
@pytest.mark.parametrize('input_string',( | ||
# smiple linear seqeunce | ||
"{[#PEO][#PMA]}", | ||
# ring | ||
"{[#TC5]1[#TC5][#TC5][#TC5][#TC5]1}", | ||
# branched | ||
"{[#PE][#PMA]([#PEO][#PEO][#PEO])[#PE]}", | ||
# branched nested | ||
"{[#PE][#PMA]([#PEO][#PEO]([#OMA][#OMA]1[#OMA][#OMA]1))[#PE]}", | ||
# special cycle | ||
"{[#PE]1[#PMA]1}", | ||
# special triple cycle | ||
"{[#A]12[#B]12}", | ||
)) | ||
def test_write_mol_graphs(input_string): | ||
mol_graph = read_cgsmiles(input_string) | ||
out_string = write_cgsmiles_graph(mol_graph) | ||
out_graph = read_cgsmiles(out_string) | ||
assertEqualGraphs(mol_graph, out_graph) | ||
|
||
@pytest.mark.parametrize('input_string',( | ||
# smiple linear seqeunce | ||
"{[#PEO][#PMMA][#PEO][#PMMA]}.{#PEO=[>]COC[<],#PMMA=[>]CC(C)[<]C(=O)OC}", | ||
# something with ring | ||
"{[#TC5]1[#TC5][#TC5]1}.{#TC5=[$]cc[$]}",)) | ||
def test_write_cgsmiles(input_string): | ||
resolver = MoleculeResolver.from_string(input_string) | ||
fragment_dicts = resolver.fragment_dicts | ||
molecule = resolver.molecule | ||
output_string = write_cgsmiles(molecule, fragment_dicts) | ||
out_resolver = MoleculeResolver.from_string(output_string) | ||
out_mol = out_resolver.molecule | ||
assertEqualGraphs(molecule, out_mol) | ||
out_fragments = out_resolver.fragment_dicts | ||
assert len(fragment_dicts) == len(out_fragments) | ||
for frag_dict, frag_dict_out in zip(fragment_dicts, out_fragments): | ||
assert set(frag_dict_out) == set(frag_dict) | ||
for fragname in frag_dict: | ||
# we cannot be sure that the atomnames are the same because they | ||
# will depend on the order | ||
nx.set_node_attributes(frag_dict_out[fragname], None, "atomname") | ||
nx.set_node_attributes(frag_dict[fragname], None, "atomname") | ||
assertEqualGraphs(frag_dict_out[fragname], frag_dict[fragname]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,242 @@ | ||
import logging | ||
from collections import defaultdict | ||
import networkx as nx | ||
from pysmiles.smiles_helper import format_atom | ||
from pysmiles.write_smiles import _get_ring_marker,_write_edge_symbol | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
order_to_symbol = {0: '.', 1: '-', 1.5: ':', 2: '=', 3: '#', 4: '$'} | ||
|
||
def format_node(molecule, current): | ||
""" | ||
Format a node from a `molecule` graph according to | ||
the CGSmiles syntax. The attribute fragname has to | ||
be set for the `current` node. | ||
Parameters | ||
---------- | ||
molecule: nx.Graph | ||
current: abc.hashbale | ||
Returns | ||
------- | ||
str | ||
the formatted string | ||
""" | ||
node = "[#{}]".format(molecule.nodes[current]['fragname']) | ||
return node | ||
|
||
def format_bonding(bonding): | ||
""" | ||
Given the list of bonding descriptors format them | ||
such that they can be added after a node/atom. This | ||
function wraps the descriptor in [ ] braces and makes | ||
sure that the bond order annotation is removed. | ||
Parameters | ||
---------- | ||
bonding: list[str] | ||
list of bonding descriptors | ||
Returns | ||
------- | ||
str | ||
the formatted bonding descriptor string | ||
""" | ||
bond_str = "" | ||
for bonding_descrpt in bonding: | ||
bond_order = bonding_descrpt[-1] | ||
order_symb = order_to_symbol[int(bond_order)] | ||
if order_symb != '-': | ||
bond_str = order_symb | ||
bond_str += "["+str(bonding_descrpt[:-1])+"]" | ||
return bond_str | ||
|
||
def write_graph(molecule, smiles_format=False, default_element='*'): | ||
""" | ||
Creates a CGsmiles string describing `molecule`. | ||
`molecule` should be a single connected component. | ||
Parameters | ||
---------- | ||
molecule : nx.Graph | ||
The molecule for which a CGsmiles string should be generated. | ||
smiles_format: | ||
If the nodes are written using the OpenSmiles standard format. | ||
Returns | ||
------- | ||
str | ||
The CGSmiles string describing `molecule`. | ||
""" | ||
start = min(molecule) | ||
dfs_successors = nx.dfs_successors(molecule, source=start) | ||
|
||
predecessors = defaultdict(list) | ||
for node_key, successors in dfs_successors.items(): | ||
for successor in successors: | ||
predecessors[successor].append(node_key) | ||
predecessors = dict(predecessors) | ||
# We need to figure out which edges we won't cross when doing the dfs. | ||
# These are the edges we'll need to add to the smiles using ring markers. | ||
edges = set() | ||
for n_idx, n_jdxs in dfs_successors.items(): | ||
for n_jdx in n_jdxs: | ||
edges.add(frozenset((n_idx, n_jdx))) | ||
total_edges = set(map(frozenset, molecule.edges)) | ||
ring_edges = list(total_edges - edges) | ||
# in cgsmiles graphs only bonds of order 1 and 2 | ||
# exists; order 2 means we have a ring at the | ||
# higher resolution. These orders are therefore | ||
# represented as rings and that requires to | ||
# add them to the ring list | ||
if not smiles_format: | ||
for edge in molecule.edges: | ||
if molecule.edges[edge]['order'] != 1: | ||
for n in range(1, molecule.edges[edge]['order']): | ||
ring_edges.append(frozenset(edge)) | ||
|
||
atom_to_ring_idx = defaultdict(list) | ||
ring_idx_to_bond = {} | ||
ring_idx_to_marker = {} | ||
for ring_idx, (n_idx, n_jdx) in enumerate(ring_edges, 1): | ||
atom_to_ring_idx[n_idx].append(ring_idx) | ||
atom_to_ring_idx[n_jdx].append(ring_idx) | ||
ring_idx_to_bond[ring_idx] = (n_idx, n_jdx) | ||
|
||
branch_depth = 0 | ||
branches = set() | ||
to_visit = [start] | ||
smiles = '' | ||
|
||
while to_visit: | ||
current = to_visit.pop() | ||
if current in branches: | ||
branch_depth += 1 | ||
smiles += '(' | ||
branches.remove(current) | ||
|
||
if current in predecessors: | ||
# It's not the first atom we're visiting, so we want to see if the | ||
# edge we last crossed to get here is interesting. | ||
previous = predecessors[current] | ||
assert len(previous) == 1 | ||
previous = previous[0] | ||
if smiles_format and _write_edge_symbol(molecule, previous, current): | ||
order = molecule.edges[previous, current].get('order', 1) | ||
smiles += order_to_symbol[order] | ||
|
||
if smiles_format: | ||
smiles += format_atom(molecule, current, default_element) | ||
else: | ||
smiles += format_node(molecule, current) | ||
|
||
# we add the bonding descriptors if there are any | ||
if molecule.nodes[current].get('bonding', False): | ||
smiles += format_bonding(molecule.nodes[current]['bonding']) | ||
|
||
if current in atom_to_ring_idx: | ||
# We're going to need to write a ring number | ||
ring_idxs = atom_to_ring_idx[current] | ||
for ring_idx in ring_idxs: | ||
ring_bond = ring_idx_to_bond[ring_idx] | ||
if ring_idx not in ring_idx_to_marker: | ||
marker = _get_ring_marker(ring_idx_to_marker.values()) | ||
ring_idx_to_marker[ring_idx] = marker | ||
new_marker = True | ||
else: | ||
marker = ring_idx_to_marker.pop(ring_idx) | ||
new_marker = False | ||
|
||
if smiles_format and _write_edge_symbol(molecule, *ring_bond) and new_marker: | ||
order = molecule.edges[ring_bond].get('order', 1) | ||
smiles += order_to_symbol[order] | ||
|
||
smiles += str(marker) if marker < 10 else '%{}'.format(marker) | ||
|
||
if current in dfs_successors: | ||
# Proceed to the next node in this branch | ||
next_nodes = dfs_successors[current] | ||
# ... and if needed, remember to return here later | ||
branches.update(next_nodes[1:]) | ||
to_visit.extend(next_nodes) | ||
elif branch_depth: | ||
# We're finished with this branch. | ||
smiles += ')' | ||
branch_depth -= 1 | ||
|
||
smiles += ')' * branch_depth | ||
return smiles | ||
|
||
def write_cgsmiles_graph(molecule): | ||
""" | ||
Write a CGSmiles graph sans fragments at | ||
different resolution. | ||
Parameters | ||
---------- | ||
molecule: nx.Graph | ||
a molecule where each node as a fragname attribute | ||
that is used as name in the CGSmiles string. | ||
Returns | ||
------- | ||
str | ||
the CGSmiles string | ||
""" | ||
|
||
cgsmiles_str = write_graph(molecule) | ||
return "{" + cgsmiles_str + "}" | ||
|
||
def write_cgsmiles_fragments(fragment_dict, smiles_format=True): | ||
""" | ||
Write fragments of molecule graph. To identify the fragments | ||
all nodes with the same `fragname` and `fragid` attributes | ||
are considered as fragment. Bonding between fragments is | ||
extracted from the `bonding` edge attributes. | ||
Parameters | ||
---------- | ||
fragment_dict: dict[str, nx.Graph] | ||
a dict of fragment graphs | ||
smiles_format: bool | ||
write all atom SMILES if True (default) otherwise | ||
write CGSmiles | ||
Returns | ||
------- | ||
str | ||
""" | ||
fragment_str = "" | ||
for fragname, frag_graph in fragment_dict.items(): | ||
fragment_str += f"#{fragname}=" | ||
# format graph depending on resolution | ||
fragment_str += write_graph(frag_graph, smiles_format=smiles_format) + "," | ||
fragment_str = "{" + fragment_str[:-1] + "}" | ||
return fragment_str | ||
|
||
def write_cgsmiles(molecule_graph, fragments, last_all_atom=True): | ||
""" | ||
Write a CGSmiles string given a low resolution molecule graph | ||
and any number of higher resolutions provided as fragment dicts. | ||
Parameters | ||
---------- | ||
molecule_graph: nx.Graph | ||
fragments: list[dict[nx.Graph]] | ||
a list of fragment dicts | ||
last_all_atom: bool | ||
if the last set of fragments is at the all_atom level | ||
Returns | ||
------- | ||
str | ||
CGSmiles string | ||
""" | ||
final_str = write_cgsmiles_graph(molecule_graph) | ||
for layer, fragment in enumerate(fragments): | ||
all_atom = (layer == len(fragments)-1) and last_all_atom | ||
fragment_str = write_cgsmiles_fragments(fragment, smiles_format=all_atom) | ||
final_str += "." + fragment_str | ||
return final_str |