diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py
new file mode 100644
index 00000000..1801a437
--- /dev/null
+++ b/polyply/src/big_smile_mol_processor.py
@@ -0,0 +1,143 @@
+import re
+import networkx as nx
+import pysmiles
+from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol,
+                                           force_field_from_fragments)
+from polyply.src.map_to_molecule import MapToMolecule
+
+VALENCES = pysmiles.smiles_helper.VALENCES
+VALENCES.update({"H": (1,)})
+
+def compatible(left, right):
+    """
+    Check bonding descriptor compatibility according
+    to the BigSmiles syntax conventions.
+
+    Parameters
+    ----------
+    left: str
+    right: str
+
+    Returns
+    -------
+    bool
+    """
+    if left == right and left not in '> <':
+        return True
+    l, r = left[0], right[0]
+    if (l, r) == ('<', '>') or (l, r) == ('>', '<'):
+        return left[1:] == right[1:]
+    return False
+
+def generate_edge(source, target, bond_attribute="bonding"):
+    """
+    Given a source and a target graph, which have bonding
+    descriptors stored as node attributes, find a pair of
+    matching descriptors and return the respective nodes.
+    The function also returns the bonding descriptors. If
+    no matching pair of bonding descriptors is found, a
+    LookupError is raised.
+
+    Parameters
+    ----------
+    source: :class:`nx.Graph`
+    target: :class:`nx.Graph`
+    bond_attribute: `abc.hashable`
+        the node attribute under which the bonding
+        descriptors are stored
+
+    Returns
+    -------
+    ((abc.hashable, abc.hashable), (str, str))
+        the nodes as well as the bonding descriptors
+
+    Raises
+    ------
+    LookupError
+        if no match is found
+    """
+    source_nodes = nx.get_node_attributes(source, bond_attribute)
+    target_nodes = nx.get_node_attributes(target, bond_attribute)
+    for source_node in source_nodes:
+        for target_node in target_nodes:
+            #print(source_node, target_node)
+            bond_sources = source_nodes[source_node]
+            bond_targets = target_nodes[target_node]
+            for bond_source in bond_sources:
+                for bond_target in bond_targets:
+                    #print(bond_source, bond_target)
+                    if compatible(bond_source, bond_target):
+                        return ((source_node, target_node), (bond_source, bond_target))
+    raise LookupError
+
+class DefBigSmileParser:
+    """
+    Parse a string instance of a defined BigSmile,
+    which describes a polymer molecule.
+    """
+
+    def __init__(self, force_field):
+        self.force_field = force_field
+        self.meta_molecule = None
+        self.molecule = None
+
+    def edges_from_bonding_descrpt(self):
+        """
+        Make edges according to the bonding descriptors stored
+        in the node attributes of the meta_molecule residue graph.
+        When a bonding descriptor is consumed it is removed from the
+        list; however, the meta_molecule edge gets an attribute with
+        the bonding descriptors that formed the edge. Unconsumed
+        bonding descriptors are later replaced by hydrogen atoms.
+        """
+        for prev_node, node in nx.dfs_edges(self.meta_molecule):
+            prev_graph = self.meta_molecule.nodes[prev_node]['graph']
+            node_graph = self.meta_molecule.nodes[node]['graph']
+            edge, bonding = generate_edge(prev_graph,
+                                          node_graph)
+            # this is a bit of a workaround because at this stage the
+            # bonding list is actually shared between all residues of
+            # the same type; so we first make a copy and then replace
+            # the list without the used bonding descriptor
+            prev_bond_list = prev_graph.nodes[edge[0]]['bonding'].copy()
+            prev_bond_list.remove(bonding[0])
+            prev_graph.nodes[edge[0]]['bonding'] = prev_bond_list
+            node_bond_list = node_graph.nodes[edge[1]]['bonding'].copy()
+            node_bond_list.remove(bonding[1])
+            node_graph.nodes[edge[1]]['bonding'] = node_bond_list
+            self.meta_molecule.molecule.add_edge(edge[0], edge[1], bonding=bonding)
+
+    def replace_unconsumed_bonding_descrpt(self):
+        """
+        We allow multiple bonding descriptors per atom, which,
+        however, are not always consumed. In this case the leftover
+        bonding descriptors are replaced by hydrogen atoms.
+        """
+        for node in self.meta_molecule.nodes:
+            graph = self.meta_molecule.nodes[node]['graph']
+            bonding = nx.get_node_attributes(graph, "bonding")
+            for node, bondings in bonding.items():
+                element = graph.nodes[node]['element']
+                hcount = VALENCES[element][0] -\
+                         self.meta_molecule.molecule.degree(node) + 1
+                attrs = {attr: graph.nodes[node][attr] for attr in ['resname', 'resid']}
+                attrs['element'] = 'H'
+                for new_id in range(1, hcount):
+                    new_node = len(self.meta_molecule.molecule.nodes) + 1
+                    graph.add_edge(node, new_node)
+                    attrs['atomname'] = "H" + str(new_id + len(graph.nodes))
+                    graph.nodes[new_node].update(attrs)
+                    self.meta_molecule.molecule.add_edge(node, new_node)
+                    self.meta_molecule.molecule.nodes[new_node].update(attrs)
+
+    def parse(self, big_smile_str):
+        res_pattern, residues = re.findall(r"\{[^\}]+\}", big_smile_str)
+        self.meta_molecule = res_pattern_to_meta_mol(res_pattern)
+        self.force_field = force_field_from_fragments(residues)
+        MapToMolecule(self.force_field).run_molecule(self.meta_molecule)
+        self.edges_from_bonding_descrpt()
+        self.replace_unconsumed_bonding_descrpt()
+        return self.meta_molecule
+
+# ToDo
+# - clean copying of bond-list attributes
diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py
new file mode 100644
index 00000000..ec136bea
--- /dev/null
+++ b/polyply/src/big_smile_parsing.py
@@ -0,0 +1,367 @@
+from collections import defaultdict
+import re
+import numpy as np
+try:
+    import pysmiles
+except ImportError as error:
+    msg = ("You are using functionality that requires "
+           "the pysmiles package. Use pip install pysmiles.")
+    raise ImportError(msg) from error
+import networkx as nx
+from vermouth.forcefield import ForceField
+from vermouth.molecule import Block
+from polyply.src.meta_molecule import MetaMolecule
+
+PATTERNS = {"bond_anchor": r"\[\$.*?\]",
+            "place_holder": r"\[\#.*?\]",
+            "annotation": r"\|.*?\|",
+            "fragment": r'#(\w+)=((?:\[.*?\]|[^,\[\]]+)*)',
+            "seq_pattern": r'\{([^}]*)\}(?:\.\{([^}]*)\})?'}
+
+def _find_next_character(string, chars, start):
+    for idx, token in enumerate(string[start:]):
+        if token in chars:
+            return idx+start
+    return np.inf
+
+def _expand_branch(meta_mol, current, anchor, recipe):
+    prev_node = anchor
+    for bdx, (resname, n_mon) in enumerate(recipe):
+        if bdx == 0:
+            anchor = current
+        for _ in range(0, n_mon):
+            connection = [(prev_node, current)]
+            meta_mol.add_monomer(current,
+                                 resname,
+                                 connection)
+            prev_node = current
+            current += 1
+        prev_node = anchor
+    return meta_mol, current, prev_node
+
+def res_pattern_to_meta_mol(pattern):
+    """
+    Generate a :class:`polyply.MetaMolecule` from a
+    pattern string describing a residue graph with the
+    simplified big-smile syntax.
+
+    The syntax scheme consists of two curly braces
+    enclosing the residue graph sequence. It can contain
+    any enumeration of residues by writing them as if they
+    were SMILES atoms, but the atom name is given by # + resname.
+    This input format can handle branching as well; however,
+    macrocycles are currently not supported.
+
+    General Pattern
+    '{' + [#resname_1][#resname_2]... + '}'
+
+    In addition to plain enumeration any residue may be
+    followed by a '|' and an integer number that
+    specifies how many times the given residue should
+    be added within a sequence. For example, a pentamer
+    of PEO can be written as:
+
+    {[#PEO][#PEO][#PEO][#PEO][#PEO]}
+
+    or
+
+    {[#PEO]|5}
+
+    The block syntax also applies to branches. Here the convention
+    is that the complete branch including its first anchoring
+    residue is repeated. For example, to generate a PMA-g-PEG
+    polymer containing 15 residues the following syntax is permitted:
+
+    {[#PMA]([#PEO][#PEO])|5}
+
+    Parameters
+    ----------
+    pattern: str
+        a string describing the meta-molecule
+
+    Returns
+    -------
+    :class:`polyply.MetaMolecule`
+    """
+    meta_mol = MetaMolecule()
+    current = 0
+    # stores one or more branch anchors; each next
+    # anchor belongs to a nested branch
+    branch_anchor = []
+    # used for storing the composition protocol of
+    # branches; each entry is a list of branches
+    # extending from the anchor point
+    recipes = defaultdict(list)
+    # the previous node
+    prev_node = None
+    # do we have an open branch
+    branching = False
+    # each element in the for loop matches a pattern
+    # '[' + '#' + some alphanumeric name + ']'
+    for match in re.finditer(PATTERNS['place_holder'], pattern):
+        start, stop = match.span()
+        # we start a new branch when the residue is preceded by '('
+        # as in ... ([#PEO] ...
+        if pattern[start-1] == '(':
+            branching = True
+            branch_anchor.append(prev_node)
+            # the recipe for making the branch includes the anchor, which
+            # is hence the first residue in the list
+            recipes[branch_anchor[-1]] = [(meta_mol.nodes[prev_node]['resname'], 1)]
+        # here we check if the atom is followed by an expansion character '|'
+        # as in ... [#PEO]|
+        if stop < len(pattern) and pattern[stop] == '|':
+            # eon => end of next
+            # we find the next character that starts a new residue, ends a branch or
+            # ends the complete pattern
+            eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop)
+            # between the expansion character and the eon character
+            # is any number that corresponds to the number of times (i.e. monomers)
+            # that this atom should be added
+            n_mon = int(pattern[stop+1:eon])
+        else:
+            n_mon = 1
+
+        # the resname starts at the second character and ends
+        # one before the last according to the above pattern
+        resname = match.group(0)[2:-1]
+        # if this residue is part of a branch we store it in
+        # the recipe dict together with the anchor residue
+        # and expansion number
+        if branching:
+            recipes[branch_anchor[-1]].append((resname, n_mon))
+
+        # now we add the new residue as often as required
+        connection = []
+        for _ in range(0, n_mon):
+            if prev_node is not None:
+                connection = [(prev_node, current)]
+            meta_mol.add_monomer(current,
+                                 resname,
+                                 connection)
+            prev_node = current
+            current += 1
+
+        # here we check if the residue considered before is the
+        # last residue of a branch (i.e. '...[#residue])');
+        # that is the case if the branch closure comes before
+        # any new atom begins
+        branch_stop = _find_next_character(pattern, ['['], stop) >\
+                      _find_next_character(pattern, [')'], stop)
+
+        # if the branch ends we reset the anchor
+        # and set branching False unless we are in
+        # a nested branch
+        if stop <= len(pattern) and branch_stop:
+            branching = False
+            prev_node = branch_anchor.pop()
+            if branch_anchor:
+                branching = True
+            #========================================
+            #       expansion for branches
+            #========================================
+            # We need to know how often the branch has
+            # to be added so we first identify the branch
+            # terminal character ')' called eon_a.
+            eon_a = _find_next_character(pattern, [')'], stop)
+            # Then we check if the expansion character
+            # is next.
+            if stop+1 < len(pattern) and pattern[eon_a+1] == "|":
+                # If there is one we find the beginning
+                # of the next branch, residue or end of the string.
+                # As before all characters in between are a number that
+                # is how often the branch is expanded.
+                eon_b = _find_next_character(pattern, ['[', ')', '(', '}'], eon_a+1)
+                # the outermost loop goes over how often the branch has to be
+                # added to the existing sequence
+                for idx in range(0, int(pattern[eon_a+2:eon_b])-1):
+                    prev_anchor = None
+                    skip = 0
+                    # in principle each branch can contain any number of nested branches;
+                    # each branch is itself a recipe that has an anchor atom
+                    for ref_anchor, recipe in list(recipes.items())[len(branch_anchor):]:
+                        # starting from the first nested branch we have to do some
+                        # math to find the anchor atom relative to the first branch;
+                        # we also skip the first residue in recipe, which is the
+                        # anchor residue. Only the outermost branch in an expansion
+                        # is expanded including the anchor. This allows easy description
+                        # of graft polymers.
+                        if prev_anchor:
+                            offset = ref_anchor - prev_anchor
+                            prev_node = prev_node + offset
+                            skip = 1
+                        # this function simply adds the residues of the particular
+                        # branch
+                        meta_mol, current, prev_node = _expand_branch(meta_mol,
+                                                                      current=current,
+                                                                      anchor=prev_node,
+                                                                      recipe=recipe[skip:])
+                        # if this is the first branch we want to set the anchor
+                        # as the base anchor to which we jump back after all nested
+                        # branches have been added
+                        if prev_anchor is None:
+                            base_anchor = prev_node
+                        # store the previous anchor so we can do the math for nested
+                        # branches
+                        prev_anchor = ref_anchor
+                    # all branches added; then go back to the base anchor
+                    prev_node = base_anchor
+            # if all branches are done we need to reset the lists
+            # when all nested branches are completed
+            if len(branch_anchor) == 0:
+                recipes = defaultdict(list)
+    return meta_mol
+
+def tokenize_big_smile(big_smile):
+    """
+    Processes a BigSmile string by storing the
+    BigSmile-specific bonding descriptors
+    in a dict with reference to the atom they
+    refer to. Furthermore, a cleaned SMILES
+    string is generated with the BigSmile
+    specific syntax removed.
+
+    Parameters
+    ----------
+    big_smile: str
+        a BigSmile smiles string
+
+    Returns
+    -------
+    str
+        a canonical smiles string
+    dict
+        a dict mapping bonding descriptors
+        to the nodes within the smiles string
+    """
+    smile_iter = iter(big_smile)
+    bonding_descrpt = defaultdict(list)
+    smile = ""
+    node_count = 0
+    prev_node = 0
+    for token in smile_iter:
+        if token == '[':
+            peek = next(smile_iter)
+            if peek in ['$', '>', '<']:
+                bond_descrp = peek
+                peek = next(smile_iter)
+                while peek != ']':
+                    bond_descrp += peek
+                    peek = next(smile_iter)
+                bonding_descrpt[prev_node].append(bond_descrp)
+            else:
+                smile = smile + token + peek
+                prev_node = node_count
+                node_count += 1
+
+        elif token == '(':
+            anchor = prev_node
+            smile += token
+        elif token == ')':
+            prev_node = anchor
+            smile += token
+        else:
+            if token not in '] H @ . - = # $ : / \\ + - %'\
+            and not token.isdigit():
+                prev_node = node_count
+                node_count += 1
+            smile += token
+    return smile, bonding_descrpt
+
+def _rebuild_h_atoms(mol_graph):
+    # special workaround to fix a
+    # pysmiles bug for a single
+    # atom molecule; we assume that the
+    # hcount is just wrong and set it to
+    # the valence number minus bonds minus
+    # bonding connectors
+    if len(mol_graph.nodes) == 1:
+        ele = mol_graph.nodes[0]['element']
+        # for N and P we assume the regular valency
+        hcount = pysmiles.smiles_helper.VALENCES[ele][0]
+        if mol_graph.nodes[0].get('bonding', False):
+            hcount -= 1
+        mol_graph.nodes[0]['hcount'] = hcount
+    else:
+        for node in mol_graph.nodes:
+            if mol_graph.nodes[node].get('bonding', False):
+                # get the element of the node
+                ele = mol_graph.nodes[node]['element']
+                # hcount is the valence minus the degree minus
+                # the number of bonding descriptors
+                hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\
+                         mol_graph.degree(node) -\
+                         len(mol_graph.nodes[node]['bonding'])
+
+                mol_graph.nodes[node]['hcount'] = hcount
+
+    pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)
+    return mol_graph
+
+def fragment_iter(fragment_str):
+    """
+    Iterates over fragments defined in a BigSmile string.
+    Fragments are named residues that consist of a single
+    SMILES string together with the BigSmile-specific bonding
+    descriptors. The function returns the resname of a named
+    fragment as well as a plain nx.Graph of the molecule
+    described by the SMILES string. Bonding descriptors are
+    annotated as node attributes with the keyword bonding.
+
+    Parameters
+    ----------
+    fragment_str: str
+        the string describing the fragments
+
+    Yields
+    ------
+    str, nx.Graph
+    """
+    for fragment in fragment_str[1:-1].split(','):
+        delim = fragment.find('=', 0)
+        resname = fragment[1:delim]
+        big_smile = fragment[delim+1:]
+        smile, bonding_descrpt = tokenize_big_smile(big_smile)
+
+        if smile == "H":
+            mol_graph = nx.Graph()
+            mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0])
+            nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
+        else:
+            mol_graph = pysmiles.read_smiles(smile)
+            nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
+            # we need to rebuild the hydrogen atoms now
+            _rebuild_h_atoms(mol_graph)
+
+        atomnames = {node[0]: node[1]['element']+str(node[0]) for node in mol_graph.nodes(data=True)}
+        nx.set_node_attributes(mol_graph, atomnames, 'atomname')
+        nx.set_node_attributes(mol_graph, resname, 'resname')
+        yield resname, mol_graph
+
+def force_field_from_fragments(fragment_str):
+    """
+    Collects the fragments defined in a BigSmile string
+    as :class:`vermouth.molecule.Block` objects in a
+    force-field object. Bonding descriptors are annotated
+    as node attributes.
+
+    Parameters
+    ----------
+    fragment_str: str
+        string using BigSmile fragment syntax
+
+    Returns
+    -------
+    :class:`vermouth.forcefield.ForceField`
+    """
+    force_field = ForceField("big_smile_ff")
+    frag_iter = fragment_iter(fragment_str)
+    for resname, mol_graph in frag_iter:
+        mol_block = Block(mol_graph)
+        force_field.blocks[resname] = mol_block
+    return force_field
+
+# ToDos
+# - remove special case hydrogen line 327ff
+# - check rebuild_h and clean up
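For orientation, a small usage sketch of the helpers added in this file; the input strings mirror those used in the new tests and the indicated outputs are what those tests imply, assuming polyply, pysmiles and vermouth are installed:

```python
import networkx as nx
from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol,
                                           tokenize_big_smile,
                                           force_field_from_fragments)

# Residue-level pattern: {[#PEO]|5} expands into a linear PEO pentamer.
meta_mol = res_pattern_to_meta_mol("{[#PEO]|5}")
print(len(meta_mol.nodes))                          # 5
print(nx.get_node_attributes(meta_mol, "resname"))  # {0: 'PEO', ..., 4: 'PEO'}

# Atom-level fragment: bonding descriptors are stripped from the SMILES
# string and reported per atom index.
smile, bonding = tokenize_big_smile("[$]COC[$]")
print(smile)          # 'COC'
print(dict(bonding))  # {0: ['$'], 2: ['$']}

# Named fragments become vermouth Blocks collected in a force field.
force_field = force_field_from_fragments("{#PEO=[$]COC[$],#OHter=[$][O]}")
print(list(force_field.blocks))  # ['PEO', 'OHter']
```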
("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O].[Na+]}", + # 0 1 2 3 4 5 6 7 8 + [('OHter', 'O Na'), ('PEO', 'C O C H H H H'), + # 9 10 11 12 13 14 15 16 17 + ('PEO', 'C O C H H H H'), ('OHter', 'O Na')], + [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), + (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), + (11, 14), (11, 15), (11, 16), (16, 17)]), + + # uncomsumed bonding IDs; note that this is not the same + # molecule as previous test case. Here one of the OH branches + # and replaces an CH2 group with CH-OH + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[>][$1]COC[<],#OHter=[$1][O]}", + [('OHter', 'O H'), ('PEO', 'C O C H H H H'), + ('PEO', 'C O C H H H H'), ('OHter', 'O H')], + [(0, 1), (0, 2), (2, 3), (2, 5), (2, 10), (3, 4), + (4, 6), (4, 7), (4, 17), (8, 9), (8, 11), (8, 14), + (8, 18), (9, 10), (10, 12), (10, 13), (14, 15)]), + # simple branched sequence + ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", + [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), + ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), ('Hter', 'H')], + [(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8), + (8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27), + (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)]), + # something with a ring + # 012 34567 + # 890123456 + ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$]H}", + [('Hter', 'H'), ('PS', 'C C C C C C C C H H H H H H H H'), + ('PS', 'C C C C C C C C H H H H H H H H'), ('Hter', 'H')], + [(0, 1), (1, 2), (1, 9), (1, 10), (2, 3), (2, 11), (2, 17), + (3, 4), (3, 8), (4, 5), (4, 12), (5, 6), (5, 13), (6, 7), + (6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25), + (17, 26), (18, 19), (18, 27), (18, 33), (19, 20), (19, 24), + (20, 21), (20, 28), (21, 22), (21, 29), (22, 23), (22, 30), + (23, 24), (23, 31), (24, 32)]), + +)) +def test_def_big_smile_parser(smile, ref_nodes, ref_edges): + ff = ForceField("new") + meta_mol = DefBigSmileParser(ff).parse(smile) +# nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) +# plt.show() + for node, ref in zip(meta_mol.nodes, ref_nodes): + assert meta_mol.nodes[node]['resname'] == ref[0] + block_graph = meta_mol.nodes[node]['graph'] + elements = list(nx.get_node_attributes(block_graph, 'element').values()) + assert elements == ref[1].split() + assert sorted(meta_mol.molecule.edges) == sorted(ref_edges) diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py new file mode 100644 index 00000000..5c1491b8 --- /dev/null +++ b/polyply/tests/test_big_smile_parsing.py @@ -0,0 +1,230 @@ +import pytest +import networkx as nx +from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, + tokenize_big_smile, + fragment_iter) + +@pytest.mark.parametrize('smile, nodes, edges',( + # smiple linear seqeunce + ("{[#PMA][#PEO][#PMA]}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2)]), + # simple branched sequence + ("{[#PMA][#PMA]([#PEO][#PEO])[#PMA]}", + ["PMA", "PMA", "PEO", "PEO", "PMA"], + [(0, 1), (1, 2), (2, 3), (1, 4)]), + # simple sequence two branches + ("{[#PMA][#PMA][#PMA]([#PEO][#PEO])([#CH3])[#PMA]}", + ["PMA", "PMA", "PMA", "PEO", "PEO", "CH3", "PMA"], + [(0, 1), (1, 2), (2, 3), (3, 4), (2, 5), (2, 6)]), + # simple linear sequence with expansion + ("{[#PMA]|3}", + ["PMA", "PMA", "PMA"], + [(0, 1), (1, 2)]), + # simple 
diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py
new file mode 100644
index 00000000..5c1491b8
--- /dev/null
+++ b/polyply/tests/test_big_smile_parsing.py
@@ -0,0 +1,230 @@
+import pytest
+import networkx as nx
+from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol,
+                                           tokenize_big_smile,
+                                           fragment_iter)
+
+@pytest.mark.parametrize('smile, nodes, edges',(
+    # simple linear sequence
+    ("{[#PMA][#PEO][#PMA]}",
+     ["PMA", "PEO", "PMA"],
+     [(0, 1), (1, 2)]),
+    # simple branched sequence
+    ("{[#PMA][#PMA]([#PEO][#PEO])[#PMA]}",
+     ["PMA", "PMA", "PEO", "PEO", "PMA"],
+     [(0, 1), (1, 2), (2, 3), (1, 4)]),
+    # simple sequence two branches
+    ("{[#PMA][#PMA][#PMA]([#PEO][#PEO])([#CH3])[#PMA]}",
+     ["PMA", "PMA", "PMA", "PEO", "PEO", "CH3", "PMA"],
+     [(0, 1), (1, 2), (2, 3), (3, 4), (2, 5), (2, 6)]),
+    # simple linear sequence with expansion
+    ("{[#PMA]|3}",
+     ["PMA", "PMA", "PMA"],
+     [(0, 1), (1, 2)]),
+    # simple branch expansion
+    ("{[#PMA]([#PEO][#PEO][#OHter])|3}",
+     ["PMA", "PEO", "PEO", "OHter",
+      "PMA", "PEO", "PEO", "OHter",
+      "PMA", "PEO", "PEO", "OHter"],
+     [(0, 1), (1, 2), (2, 3),
+      (0, 4), (4, 5), (5, 6), (6, 7),
+      (4, 8), (8, 9), (9, 10), (10, 11)]
+    ),
+    # nested branch with expansion
+    ("{[#PMA]([#PEO]|3)|2}",
+     ["PMA", "PEO", "PEO", "PEO",
+      "PMA", "PEO", "PEO", "PEO"],
+     [(0, 1), (1, 2), (2, 3),
+      (0, 4), (4, 5), (5, 6), (6, 7)]
+    ),
+    # nested branching
+    # 0 1 2 3 4 5 6
+    ("{[#PMA][#PMA]([#PEO][#PEO]([#OH])[#PEO])[#PMA]}",
+     ["PMA", "PMA", "PEO", "PEO", "OH",
+      "PEO", "PMA"],
+     [(0, 1), (1, 2), (2, 3),
+      (3, 4), (3, 5), (1, 6)]
+    ),
+    # nested branching plus expansion
+    # 0 1 2 3 4/5 6 7
+    ("{[#PMA][#PMA]([#PEO][#PEO]([#OH]|2)[#PEO])[#PMA]}",
+     ["PMA", "PMA", "PEO", "PEO", "OH", "OH",
+      "PEO", "PMA"],
+     [(0, 1), (1, 2), (2, 3),
+      (3, 4), (4, 5), (3, 6), (1, 7)]
+    ),
+    # nested branching plus expansion incl. branch
+    # 0 1 2 3 4  5
+    # 6 7 8 9 10 11
+    ("{[#PMA][#PMA]([#PEO][#PEO]([#OH])[#PEO])|2[#PMA]}",
+     ["PMA", "PMA", "PEO", "PEO", "OH", "PEO",
+      "PMA", "PEO", "PEO", "PEO", "OH", "PMA"],
+     [(0, 1), (1, 2), (2, 3),
+      (3, 4), (3, 5), (1, 6), (6, 7), (7, 8),
+      (8, 9), (8, 10), (6, 11)]
+    ),
+    # nested branching plus expansion of the nested branch
+    # here the nested branch is expanded
+    #  0 - 1 - 10
+    #      |
+    #      2
+    #      |
+    #      3 {- 5 - 7 } - 9 -> the expanded fragment
+    #      |    |   |
+    #      4    6   8
+    ("{[#PMA][#PMA]([#PEO][#PQ]([#OH])|3[#PEO])[#PMA]}",
+     ["PMA", "PMA", "PEO", "PQ", "OH",
+      "PQ", "OH", "PQ", "OH", "PEO", "PMA"],
+     [(0, 1), (1, 2), (1, 10),
+      (2, 3), (3, 4), (3, 5), (5, 6),
+      (5, 7), (7, 8), (7, 9)]
+    ),
+    # nested branching plus expansion of the nested branch;
+    # here the nested branch is expanded and a complete
+    # new branch is added
+    #      11        13
+    #      |         |
+    #  0 - 1 - 10 - 12
+    #      |
+    #      2
+    #      |
+    #      3 {- 5 - 7 } - 9 -> the expanded fragment
+    #      |    |   |
+    #      4    6   8
+    ("{[#PMA][#PMA]([#PEO][#PQ]([#OH])|3[#PEO])[#PMA]([#CH3])|2}",
+     ["PMA", "PMA", "PEO", "PQ", "OH",
+      "PQ", "OH", "PQ", "OH", "PEO", "PMA", "CH3", "PMA", "CH3"],
+     [(0, 1), (1, 2), (1, 10),
+      (2, 3), (3, 4), (3, 5), (5, 6),
+      (5, 7), (7, 8), (7, 9), (10, 11), (10, 12), (12, 13)]
+    ),
+))
+def test_res_pattern_to_meta_mol(smile, nodes, edges):
+    """
+    Test that the meta-molecule is correctly reproduced
+    from the simplified smiles string syntax.
+    """
+    meta_mol = res_pattern_to_meta_mol(smile)
+    assert len(meta_mol.edges) == len(edges)
+    for edge in edges:
+        assert meta_mol.has_edge(*edge)
+    resnames = nx.get_node_attributes(meta_mol, 'resname')
+    assert nodes == list(resnames.values())
+
+@pytest.mark.parametrize('big_smile, smile, bonding',(
+    # simple symmetric bonding
+    ("[$]COC[$]",
+     "COC",
+     {0: ["$"], 2: ["$"]}),
+    # simple symmetric but with explicit hydrogen
+    ("[$][CH2]O[CH2][$]",
+     "[CH2]O[CH2]",
+     {0: ["$"], 2: ["$"]}),
+    # simple symmetric bonding; multiple descriptors
+    ("[$]COC[$][$1]",
+     "COC",
+     {0: ["$"], 2: ["$", "$1"]}),
+    # named different bonding descriptors
+    ("[$1]CCCC[$2]",
+     "CCCC",
+     {0: ["$1"], 3: ["$2"]}),
+    # ring and bonding descriptors
+    ("[$1]CC[$2]C1CCCCC1",
+     "CCC1CCCCC1",
+     {0: ["$1"], 1: ["$2"]}),
+    # bonding descriptor after branch
+    ("C(COC[$1])[$2]CCC[$3]",
+     "C(COC)CCC",
+     {0: ["$2"], 3: ["$1"], 6: ["$3"]}),
+    # left right bonding descriptors
+    ("[>]COC[<]",
+     "COC",
+     {0: [">"], 2: ["<"]})
+))
+def test_tokenize_big_smile(big_smile, smile, bonding):
+    new_smile, new_bonding = tokenize_big_smile(big_smile)
+    assert new_smile == smile
+    assert new_bonding == bonding
+
+@pytest.mark.parametrize('fragment_str, nodes, edges',(
+    # single fragment
+    ("{#PEO=[$]COC[$]}",
+     {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}),
+              (1, {"atomname": "O1", "resname": "PEO", "element": "O"}),
+              (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$"], "element": "C"}),
+              (3, {"atomname": "H3", "resname": "PEO", "element": "H"}),
+              (4, {"atomname": "H4", "resname": "PEO", "element": "H"}),
+              (5, {"atomname": "H5", "resname": "PEO", "element": "H"}),
+              (6, {"atomname": "H6", "resname": "PEO", "element": "H"}),
+             )},
+     {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}),
+    # single fragment but with explicit hydrogen in smiles
+    ("{#PEO=[$][CH2]O[CH2][$]}",
+     {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}),
+              (1, {"atomname": "O1", "resname": "PEO", "element": "O"}),
+              (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$"], "element": "C"}),
+              (3, {"atomname": "H3", "resname": "PEO", "element": "H"}),
+              (4, {"atomname": "H4", "resname": "PEO", "element": "H"}),
+              (5, {"atomname": "H5", "resname": "PEO", "element": "H"}),
+              (6, {"atomname": "H6", "resname": "PEO", "element": "H"}),
+             )},
+     {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}),
+    # test NH3 terminal
+    ("{#AMM=N[$]}",
+     {"AMM": ((0, {"atomname": "N0", "resname": "AMM", "bonding": ["$"], "element": "N"}),
+              (1, {"atomname": "H1", "resname": "AMM", "element": "H"}),
+              (2, {"atomname": "H2", "resname": "AMM", "element": "H"}),
+             )},
+     {"AMM": [(0, 1), (0, 2)]}),
+    # single fragment + 1 terminal (i.e. only 1 bonding descriptor)
+    ("{#PEO=[$]COC[$],#OHter=[$][OH]}",
+     {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}),
+              (1, {"atomname": "O1", "resname": "PEO", "element": "O"}),
+              (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$"], "element": "C"}),
+              (3, {"atomname": "H3", "resname": "PEO", "element": "H"}),
+              (4, {"atomname": "H4", "resname": "PEO", "element": "H"}),
+              (5, {"atomname": "H5", "resname": "PEO", "element": "H"}),
+              (6, {"atomname": "H6", "resname": "PEO", "element": "H"}),
+             ),
+      "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": ["$"], "element": "O"}),
+                (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))},
+     {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)],
+      "OHter": [(0, 1)]}),
+    # single fragment + 1 terminal but multiple bonding descriptors;
+    # this adjusts the hydrogen count
+    ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}",
+     {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}),
+              (1, {"atomname": "O1", "resname": "PEO", "element": "O"}),
+              (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$", "$1"], "element": "C"}),
+              (3, {"atomname": "H3", "resname": "PEO", "element": "H"}),
+              (4, {"atomname": "H4", "resname": "PEO", "element": "H"}),
+              (5, {"atomname": "H5", "resname": "PEO", "element": "H"}),
+             ),
+      "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": ["$"], "element": "O"}),
+                (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))},
+     {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)],
+      "OHter": [(0, 1)]}),
+    # single fragment + 1 terminal but multiple bonding descriptors
+    # and explicit hydrogen in the smiles string
+    ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}",
+     {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}),
+              (1, {"atomname": "O1", "resname": "PEO", "element": "O"}),
+              (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$", "$1"], "element": "C"}),
+              (3, {"atomname": "H3", "resname": "PEO", "element": "H"}),
+              (4, {"atomname": "H4", "resname": "PEO", "element": "H"}),
+              (5, {"atomname": "H5", "resname": "PEO", "element": "H"}),
+             ),
+      "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": ["$"], "element": "O"}),
+                (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))},
+     {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)],
+      "OHter": [(0, 1)]}),
+
+))
+def test_fragment_iter(fragment_str, nodes, edges):
+    for resname, mol_graph in fragment_iter(fragment_str):
+        assert len(mol_graph.nodes) == len(nodes[resname])
+        for node, ref_node in zip(mol_graph.nodes(data=True), nodes[resname]):
+            assert node[0] == ref_node[0]
+            for key in ref_node[1]:
+                assert ref_node[1][key] == node[1][key]
+        assert sorted(mol_graph.edges) == sorted(edges[resname])
diff --git a/requirements-tests.txt b/requirements-tests.txt
index 595a4902..03357910 100644
--- a/requirements-tests.txt
+++ b/requirements-tests.txt
@@ -4,3 +4,4 @@ pytest-cov
 pylint
 codecov
 tqdm
+pysmiles