-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[5] Writer #4
[5] Writer #4
Changes from 20 commits
41ac6ad
ac865ed
a51f186
5277c5d
7351354
91eafe2
7ba2ab5
85b1c72
ecfe5e1
e6d8586
93f717d
0968dec
aad9e4a
620f538
45f649e
f45a6ff
7c4a94d
db7bd7a
2d43d69
54ad72b
4ae7917
431b988
7370d54
116b5e9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import pytest | ||
from pysmiles.testhelper import assertEqualGraphs | ||
from cgsmiles.read_fragments import read_fragments | ||
from cgsmiles.read_cgsmiles import read_cgsmiles | ||
from cgsmiles.write_cgsmiles import write_cgsmiles_fragments, write_cgsmiles_graph | ||
|
||
@pytest.mark.parametrize('input_string',( | ||
# smiple linear seqeunce | ||
"{#PEO=[$]COC[$],#OHter=[$]O}", | ||
# two bonding IDs | ||
"{#PEO=[$][$A]COC[$][$B],#OHter=[$]O}", | ||
# something with bond order | ||
"{#PEO=[$]=COC[$A],#OHter=[$A]O,#PI=[$]=C}", | ||
# something with a shash operator | ||
"{#TC5=[!]CCC[!],#TN6a=[!]CNC[!]}", | ||
# something with aromatic fragments | ||
"{#TC5=[!]ccc[!],#TN6a=[!]cnc[!]}", | ||
)) | ||
def test_write_fragments(input_string): | ||
frag_dict = read_fragments(input_string) | ||
out_string = write_cgsmiles_fragments(frag_dict, smiles_format=True) | ||
frag_dict_out = read_fragments(out_string) | ||
assert set(frag_dict_out) == set(frag_dict) | ||
for fragname in frag_dict: | ||
assertEqualGraphs(frag_dict_out[fragname], frag_dict[fragname]) | ||
|
||
@pytest.mark.parametrize('input_string',( | ||
# smiple linear seqeunce | ||
"{[#PEO][#PMA]}", | ||
# ring | ||
"{[#TC5]1[#TC5][#TC5][#TC5][#TC5]1}", | ||
# branched | ||
"{[#PE][#PMA]([#PEO][#PEO][#PEO])[#PE]}", | ||
# branched nested | ||
"{[#PE][#PMA]([#PEO][#PEO]([#OMA][#OMA]1[#OMA][#OMA]1))[#PE]}", | ||
# special cycle | ||
"{[#PE]1[#PMA]1}", | ||
# special triple cycle | ||
"{[#A]12[#B]12}", | ||
)) | ||
def test_write_mol_graphs(input_string): | ||
mol_graph = read_cgsmiles(input_string) | ||
out_string = write_cgsmiles_graph(mol_graph) | ||
out_graph = read_cgsmiles(out_string) | ||
print(out_string) | ||
assertEqualGraphs(mol_graph, out_graph) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,242 @@ | ||
import logging | ||
from collections import defaultdict | ||
import networkx as nx | ||
from pysmiles.smiles_helper import format_atom | ||
from pysmiles.write_smiles import _get_ring_marker,_write_edge_symbol | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
order_to_symbol = {0: '.', 1: '-', 1.5: ':', 2: '=', 3: '#', 4: '$'} | ||
|
||
def format_node(molecule, current): | ||
""" | ||
Format a node from a `molecule` graph according to | ||
the CGSmiles syntax. The attribute fragname has to | ||
be set for the `current` node. | ||
|
||
Parameters | ||
---------- | ||
molecule: nx.Graph | ||
current: abc.hashbale | ||
|
||
Returns | ||
------- | ||
str | ||
the formatted string | ||
""" | ||
node = "[#{}]".format(molecule.nodes[current]['fragname']) | ||
return node | ||
|
||
def format_bonding(bonding): | ||
""" | ||
Given the list of bonding descriptors format them | ||
such that they can be added after a node/atom. This | ||
function wraps the descriptor in [ ] braces and makes | ||
sure that the bond order annotation is removed. | ||
|
||
Parameters | ||
---------- | ||
bonding: list[str] | ||
list of bonding descriptors | ||
|
||
Returns | ||
------- | ||
str | ||
the formatted bonding descriptor string | ||
""" | ||
bond_str = "" | ||
for bonding_descrpt in bonding: | ||
bond_order = bonding_descrpt[-1] | ||
order_symb = order_to_symbol[int(bond_order)] | ||
if order_symb != '-': | ||
bond_str = order_symb | ||
bond_str += "["+str(bonding_descrpt[:-1])+"]" | ||
return bond_str | ||
|
||
def write_graph(molecule, smiles_format=False, default_element='*'): | ||
""" | ||
Creates a CGsmiles string describing `molecule`. | ||
`molecule` should be a single connected component. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Meh. Not too much work to find all connected components and call a (this) function on those is it? And then just join them with a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 😆 this one comes from the pysmiles parser I copied. It still has to be a connected component to be valid. That's simply a CGSmiles requirement. You can have edges with zero bond order though. But they have to be edges. |
||
|
||
Parameters | ||
---------- | ||
molecule : nx.Graph | ||
The molecule for which a CGsmiles string should be generated. | ||
smiles_format: | ||
If the nodes are written using the OpenSmiles standard format. | ||
|
||
Returns | ||
------- | ||
str | ||
The CGSmiles string describing `molecule`. | ||
""" | ||
start = min(molecule) | ||
dfs_successors = nx.dfs_successors(molecule, source=start) | ||
|
||
predecessors = defaultdict(list) | ||
for node_key, successors in dfs_successors.items(): | ||
for successor in successors: | ||
predecessors[successor].append(node_key) | ||
predecessors = dict(predecessors) | ||
# We need to figure out which edges we won't cross when doing the dfs. | ||
# These are the edges we'll need to add to the smiles using ring markers. | ||
edges = set() | ||
for n_idx, n_jdxs in dfs_successors.items(): | ||
for n_jdx in n_jdxs: | ||
edges.add(frozenset((n_idx, n_jdx))) | ||
total_edges = set(map(frozenset, molecule.edges)) | ||
ring_edges = list(total_edges - edges) | ||
# in cgsmiles graphs only bonds of order 1 and 2 | ||
# exists; order 2 means we have a ring at the | ||
# higher resolution. These orders are therefore | ||
# represented as rings and that requires to | ||
# add them to the ring list | ||
if not smiles_format: | ||
for edge in molecule.edges: | ||
if molecule.edges[edge]['order'] != 1: | ||
for n in range(1, molecule.edges[edge]['order']): | ||
ring_edges.append(frozenset(edge)) | ||
|
||
atom_to_ring_idx = defaultdict(list) | ||
ring_idx_to_bond = {} | ||
ring_idx_to_marker = {} | ||
for ring_idx, (n_idx, n_jdx) in enumerate(ring_edges, 1): | ||
atom_to_ring_idx[n_idx].append(ring_idx) | ||
atom_to_ring_idx[n_jdx].append(ring_idx) | ||
ring_idx_to_bond[ring_idx] = (n_idx, n_jdx) | ||
fgrunewald marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
branch_depth = 0 | ||
branches = set() | ||
to_visit = [start] | ||
smiles = '' | ||
|
||
while to_visit: | ||
current = to_visit.pop() | ||
if current in branches: | ||
branch_depth += 1 | ||
smiles += '(' | ||
branches.remove(current) | ||
|
||
if current in predecessors: | ||
# It's not the first atom we're visiting, so we want to see if the | ||
# edge we last crossed to get here is interesting. | ||
previous = predecessors[current] | ||
assert len(previous) == 1 | ||
previous = previous[0] | ||
if smiles_format and _write_edge_symbol(molecule, previous, current): | ||
order = molecule.edges[previous, current].get('order', 1) | ||
smiles += order_to_symbol[order] | ||
|
||
if smiles_format: | ||
smiles += format_atom(molecule, current, default_element) | ||
else: | ||
smiles += format_node(molecule, current) | ||
|
||
# we add the bonding descriptors if there are any | ||
if molecule.nodes[current].get('bonding', False): | ||
smiles += format_bonding(molecule.nodes[current]['bonding']) | ||
|
||
if current in atom_to_ring_idx: | ||
# We're going to need to write a ring number | ||
ring_idxs = atom_to_ring_idx[current] | ||
for ring_idx in ring_idxs: | ||
ring_bond = ring_idx_to_bond[ring_idx] | ||
if ring_idx not in ring_idx_to_marker: | ||
marker = _get_ring_marker(ring_idx_to_marker.values()) | ||
ring_idx_to_marker[ring_idx] = marker | ||
new_marker = True | ||
else: | ||
marker = ring_idx_to_marker.pop(ring_idx) | ||
new_marker = False | ||
|
||
if smiles_format and _write_edge_symbol(molecule, *ring_bond) and new_marker: | ||
order = molecule.edges[ring_bond].get('order', 1) | ||
smiles += order_to_symbol[order] | ||
|
||
smiles += str(marker) if marker < 10 else '%{}'.format(marker) | ||
|
||
if current in dfs_successors: | ||
# Proceed to the next node in this branch | ||
next_nodes = dfs_successors[current] | ||
# ... and if needed, remember to return here later | ||
branches.update(next_nodes[1:]) | ||
to_visit.extend(next_nodes) | ||
elif branch_depth: | ||
# We're finished with this branch. | ||
smiles += ')' | ||
branch_depth -= 1 | ||
|
||
smiles += ')' * branch_depth | ||
return smiles | ||
|
||
def write_cgsmiles_graph(molecule): | ||
""" | ||
Write a CGSmiles graph sans fragments at | ||
different resolution. | ||
|
||
Parameters | ||
---------- | ||
molecule: nx.Graph | ||
a molecule where each node as a fragname attribute | ||
that is used as name in the CGSmiles string. | ||
|
||
Returns | ||
------- | ||
str | ||
the CGSmiles string | ||
""" | ||
|
||
cgsmiles_str = write_graph(molecule) | ||
return "{" + cgsmiles_str + "}" | ||
|
||
def write_cgsmiles_fragments(fragment_dict, smiles_format=True): | ||
""" | ||
Write fragments of molecule graph. To identify the fragments | ||
all nodes with the same `fragname` and `fragid` attributes | ||
are considered as fragment. Bonding between fragments is | ||
extracted from the `bonding` edge attributes. | ||
|
||
Parameters | ||
---------- | ||
fragment_dict: dict[str, nx.Graph] | ||
a dict of fragment graphs | ||
smiles_format: bool | ||
write all atom SMILES if True (default) otherwise | ||
write CGSmiles | ||
|
||
Returns | ||
------- | ||
str | ||
""" | ||
fragment_str = "" | ||
for fragname, frag_graph in fragment_dict.items(): | ||
fragment_str += f"#{fragname}=" | ||
# format graph depending on resolution | ||
fragment_str += write_graph(frag_graph, smiles_format=smiles_format) + "," | ||
fragment_str = "{" + fragment_str[:-1] + "}" | ||
return fragment_str | ||
|
||
def write_cgsmiles(molecule_graph, fragments, last_all_atom=True): | ||
""" | ||
Write a CGSmiles string given a low resolution molecule graph | ||
and any number of higher resolutions provided as fragment dicts. | ||
|
||
Parameters | ||
---------- | ||
molecule_graph: nx.Graph | ||
fragments: list[dict[nx.Graph]] | ||
a list of fragment dicts | ||
last_all_atom: bool | ||
if the last set of fragments is at the all_atom level | ||
|
||
Returns | ||
------- | ||
str | ||
CGSmiles string | ||
""" | ||
final_str = write_cgsmiles_graph(molecule) | ||
for layer, fragment in enumerate(fragments): | ||
all_atom = (layer == len(fragments)-1) and last_all_atom | ||
fragment_str = write_cgsmiles_fragments(fragment, smiles_format=all_atom) | ||
final_str += "." + fragment_str | ||
return final_str |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to elide aromatic bonds between aromatic atoms?
I prefer string formatting over concatenation personally: