From 40b89af2fe29eda0c1d61123cdd4b2eb20318eb3 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 19 Jan 2024 10:44:59 +0100 Subject: [PATCH 01/25] infrastructure for big smile parsing --- polyply/src/big_smile_parsing.py | 222 +++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 polyply/src/big_smile_parsing.py diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py new file mode 100644 index 00000000..72e504e6 --- /dev/null +++ b/polyply/src/big_smile_parsing.py @@ -0,0 +1,222 @@ +import re +import pysmiles +import networkx as nx +from vermouth.forcefield import ForceField +from vermouth.molecule import Block +from polyply.src.meta_molecule import MetaMolecule + +PATTERNS = {"bond_anchor": "\[\$.*?\]", + "place_holder": "\[\#.*?\]", + "annotation": "\|.*?\|", + "fragment": r'#(\w+)=((?:\[.*?\]|[^,\[\]]+)*)', + "seq_pattern": r'\{([^}]*)\}(?:\.\{([^}]*)\})?'} + +def res_pattern_to_meta_mol(pattern): + """ + Generate a :class:`polyply.MetaMolecule` from a + pattern string describing a residue graph with the + simplified big-smile syntax. + + The syntax scheme consists of two curly braces + enclosing the residue graph sequence. It can contain + any enumeration of residues by writing them as if they + were smile atoms but the atomname is given by # + resname. + This input fomat can handle branching as well ,however, + macrocycles are currently not supported. + + General Pattern + '{' + [#resname_1][#resname_2]... + '}' + + In addition to plain enumeration any residue may be + followed by a '|' and an integern number that + specifies how many times the given residue should + be added within a sequence. For example, a pentamer + of PEO can be written as: + + {[#PEO][#PEO][#PEO][#PEO][#PEO]} + + or + + {[#PEO]|5} + + The block syntax also applies to branches. Here the convetion + is that the complete branch including it's first anchoring + residue is repeated. 
For example, to generate a PMA-g-PEG + polymer the following syntax is permitted: + + {[#PMA]([#PEO][#PEO])|5} + + Parameters + ---------- + pattern: str + a string describing the meta-molecule + + Returns + ------- + :class:`polyply.MetaMolecule` + """ + meta_mol = MetaMolecule() + current = 0 + branch_anchor = 0 + prev_node = None + branching = False + for match in re.finditer(PATTERNS['place_holder'], pattern): + start, stop = match.span() + # new branch here + if pattern[start-1] == '(': + branching = True + branch_anchor = prev_node + recipie = [(meta_mol.nodes[prev_node]['resname'], 1)] + if stop < len(pattern) and pattern[stop] == '|': + n_mon = int(pattern[stop+1:pattern.find('[', stop)]) + else: + n_mon = 1 + + resname = match.group(0)[2:-1] + # collect all residues in branch + if branching: + recipie.append((resname, n_mon)) + + # add the new residue + connection = [] + for _ in range(0, n_mon): + if prev_node is not None: + connection = [(prev_node, current)] + meta_mol.add_monomer(current, + resname, + connection) + prev_node = current + current += 1 + + # terminate branch and jump back to anchor + if stop < len(pattern) and pattern[stop] == ')' and branching: + branching = False + prev_node = branch_anchor + # we have to multiply the branch n-times + if stop+1 < len(pattern) and pattern[stop+1] == "|": + for _ in range(0,int(pattern[stop+2:pattern.find('[', stop)])): + for bdx, (resname, n_mon) in enumerate(recipie): + if bdx == 0: + anchor = current + for _ in range(0, n_mon): + connection = [(prev_node, current)] + meta_mol.add_monomer(current, + resname, + connection) + prev_node = current + current += 1 + prev_node = anchor + return meta_mol + +def _big_smile_iter(smile): + for token in smile: + yield token + +def tokenize_big_smile(big_smile): + """ + Processes a BigSmile string by storing the + the BigSmile specific bonding descriptors + in a dict with refernce to the atom they + refer to. Furthermore, a cleaned smile + string is generated with the BigSmile + specific syntax removed. + + Parameters + ---------- + smile: str + a BigSmile smile string + + Returns + ------- + str + a canonical smile string + dict + a dict mapping bonding descriptors + to the nodes within the smile + """ + smile_iter = _big_smile_iter(big_smile) + bonding_descrpt = {} + smile = "" + node_count = 0 + prev_node = 0 + for token in smile_iter: + if token == '[': + peek = next(smile_iter) + if peek in ['$', '>', '<']: + bond_descrp = peek + peek = next(smile_iter) + while peek != ']': + bond_descrp += peek + peek = next(smile_iter) + bonding_descrpt[prev_node] = bond_descrp + else: + smile = smile + token + peek + prev_node = node_count + node_count += 1 + + elif token == '(': + anchor = prev_node + smile += token + elif token == ')': + prev_node = anchor + smile += token + else: + if token not in '@ . - = # $ : / \\ + - %': + prev_node = node_count + node_count += 1 + smile += token + return smile, bonding_descrpt + +def fragment_iter(fragment_str): + """ + Iterates over fragments defined in a BigSmile string. + Fragments are named residues that consist of a single + smile string together with the BigSmile specific bonding + descriptors. The function returns the resname of a named + fragment as well as a plain nx.Graph of the molecule + described by the smile. Bonding descriptors are annotated + as node attributes with the keyword bonding. 
+ + Parameters + ---------- + fragment_str: str + the string describing the fragments + + Yields + ------ + str, nx.Graph + """ + for fragment in fragment_str[1:-1].split(','): + delim = fragment.find('=', 0) + resname = fragment[1:delim] + big_smile = fragment[delim+1:] + smile, bonding_descrpt = tokenize_big_smile(big_smile) + mol_graph = pysmiles.read_smiles(smile) + atomnames = [str(node[0])+node[1]['element'] for node in mol_graph.nodes(data=True) ] + nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + nx.set_node_attributes(mol_graph, atomnames, 'atomname') + nx.set_node_attributes(mol_graph, resname, 'resname') + yield resname, mol_graph + +def force_field_from_fragments(fragment_str): + """ + Collects the fragments defined in a BigSmile string + as :class:`vermouth.molecule.Blocks` in a force-field + object. Bonding descriptors are annotated as node + attribtues. + + Parameters + ---------- + fragment_str: str + string using BigSmile fragment syntax + + Returns + ------- + :class:`vermouth.forcefield.ForceField` + """ + force_field = ForceField("big_smile_ff") + frag_iter = fragment_iter(fragment_str) + for resname, mol_graph in frag_iter: + mol_block = Block(mol_graph) + force_field.blocks[resname] = mol_block + return forxe_field From 05ed0456919ad1865925f45b07d2e95396053734 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 19 Jan 2024 10:47:06 +0100 Subject: [PATCH 02/25] optional dep. for pysmiles --- polyply/src/big_smile_parsing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index 72e504e6..2ad65a7b 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -1,5 +1,10 @@ import re -import pysmiles +try: + import pysmiles +except ImportError: + msg = ("You are using a functionality that requires " + "the pysmiles package. Use pip install pysmiles ") + raise ImportError(msg) import networkx as nx from vermouth.forcefield import ForceField from vermouth.molecule import Block From 82a2acc2bc0aa6a3e47eb9f51e4d4db06f4f0dbe Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 19 Jan 2024 10:50:13 +0100 Subject: [PATCH 03/25] add a processor that reads a big smile string and returns a full metamolecule including edges. --- polyply/src/big_smile_mol_processsor.py | 99 +++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 polyply/src/big_smile_mol_processsor.py diff --git a/polyply/src/big_smile_mol_processsor.py b/polyply/src/big_smile_mol_processsor.py new file mode 100644 index 00000000..8131e009 --- /dev/null +++ b/polyply/src/big_smile_mol_processsor.py @@ -0,0 +1,99 @@ +import networkx as nx +from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, + force_field_from_fragments) +from polyply.src.map_to_molecule import MapToMolecule + +def compatible(left, right): + """ + Check bonding descriptor compatibility according + to the BigSmiles syntax convetions. + + Parameters + ---------- + left: str + right: str + + Returns + ------- + bool + """ + if left == right: + return True + if left[0] == "<" and right[0] == ">": + if left[1:] == right[1:]: + return True + if left[0] == ">" and right[0] == "<": + if left[1:] == right[1:]: + return True + return False + +def generate_edge(source, target, bond_type="bonding"): + """ + Given a source and a target graph, which have bonding + descriptors stored as node attributes, find a pair of + matching descriptors and return the respective nodes. 
+ The function also returns the bonding descriptors. If + no bonding descriptor is found an instance of LookupError + is raised. + + Parameters + ---------- + source: :class:`nx.Graph` + target: :class:`nx.Graph` + bond_type: `abc.hashable` + under which attribute are the bonding descriptors + stored. + + Returns + ------- + ((abc.hashable, abc.hashable), (str, str)) + the nodes as well as bonding descriptors + + Raises + ------ + LookupError + if no match is found + """ + source_nodes = nx.get_node_attributes(source, bond_type) + target_nodes = nx.get_node_attributes(target, bond_type) + for source_node in source_nodes: + for target_node in target_nodes: + bond_source = source_nodes[source_node] + bond_target = target_nodes[target_node] + if compatible(bond_source, bond_target): + return ((source_node, target_node), (bond_source, bond_target)) + raise LookupError + +class DefBigSmileParser: + """ + Parse an a string instance of a defined BigSmile, + which describes a polymer molecule. + """ + + def __init__(self): + self.force_field = None + self.meta_molecule = None + self.molecule = None + + def edges_from_bonding_descrpt(self): + """ + Make edges according to the bonding descriptors stored + in the node attributes of meta_molecule residue graph. + If a bonding descriptor is consumed it is set to None, + however, the meta_molecule edge gets an attribute with the + bonding descriptors that formed the edge. + """ + for prev_node, node in nx.dfs_edges(self.meta_molecule): + edge, bonding = generate_edge(self.meta_molecule.nodes[prev_node]['graph'], + self.meta_molecule.nodes[node]['graph']) + self.meta_molecule.nodes[prev_node]['graph'][edge[0]]['bonding'] = None + self.meta_molecule.nodes[prev_node]['graph'][edge[1]]['bonding'] = None + self.meta_molecule.molecule.add_edge(edge, bonding=bonding) + + def parse(self, big_smile_str): + res_pattern, residues = big_smile_str.split('.') + self.meta_molecule = res_pattern_to_meta_mol(res_pattern) + self.force_field = force_field_from_fragments(residues) + MapToMolecule(self.force_field).run_molecule(self.meta_molecule) + self.edges_from_bonding_descrpt() + return self.meta_molecule From 257b76b7665355d217dfb0b71249e64255096a35 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sat, 20 Jan 2024 15:43:12 +0100 Subject: [PATCH 04/25] atest-big-smile parsing part I --- polyply/tests/test_big_smile_parsing.py | 64 +++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 polyply/tests/test_big_smile_parsing.py diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py new file mode 100644 index 00000000..43045a83 --- /dev/null +++ b/polyply/tests/test_big_smile_parsing.py @@ -0,0 +1,64 @@ +import pytest +import networkx as nx +from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, + tokenize_big_smile) + +@pytest.mark.parametrize('smile, nodes, edges',( + # smiple linear seqeunce + ("{[#PMA][#PEO][#PMA]}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2)]), + # simple branched sequence + ("{[#PMA][#PMA]([#PEO][#PEO])[#PMA]}", + ["PMA", "PMA", "PEO", "PEO", "PMA"], + [(0, 1), (1, 2), (2, 3), (1, 4)]), + # simple sequence two branches + ("{[#PMA][#PMA][#PMA]([#PEO][#PEO])([#CH3])[#PMA]}", + ["PMA", "PMA", "PMA", "PEO", "PEO", "CH3", "PMA"], + [(0, 1), (1, 2), (2, 3), (3, 4), (2, 5), (2, 6)]), + # simple linear sequence with expansion + ("{[#PMA]|3}", + ["PMA", "PMA", "PMA"], + [(0, 1), (1, 2)]), + ## simple branched with expansion + #("{[#PMA]([#PEO]|3)|2}", + #["PMA", "PEO", "PEO", 
"PEO", + # "PMA", "PEO", "PEO", "PEO"], + #[(0, 1), (1, 2), (2, 3), + # (0, 4), (4, 5), (5, 6), (6, 7)] + # ) +)) +def test_res_pattern_to_meta_mol(smile, nodes, edges): + """ + Test that the meta-molecule is correctly reproduced + from the simplified smile string syntax. + """ + meta_mol = res_pattern_to_meta_mol(smile) + assert len(meta_mol.edges) == len(edges) + for edge in edges: + assert meta_mol.has_edge(*edge) + resnames = nx.get_node_attributes(meta_mol, 'resname') + assert nodes == list(resnames.values()) + +@pytest.mark.parametrize('big_smile, smile, bonding',( + # smiple symmetric bonding + ("[$]COC[$]", + "COC", + {0: '$', 2: '$'}), + # named different bonding descriptors + ("[$1]CCCC[$2]", + "CCCC", + {0: "$1", 3: "$2"}), + # bonding descript. after branch + ("C(COC[$1])[$2]CCC[$3]", + "C(COC)CCC", + {0: '$2', 3: '$1', 6: '$3'}), + # left rigth bonding desciptors + ("[>]COC[<]", + "COC", + {0: '>', 2: '<'}) +)) +def test_tokenize_big_smile(big_smile, smile, bonding): + new_smile, new_bonding = tokenize_big_smile(big_smile) + assert new_smile == smile + assert new_bonding == bonding From 061c8efc60931b2fff5ef4c866b0c3a952ebded9 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 22 Jan 2024 16:37:32 +0100 Subject: [PATCH 05/25] fix hcount for single atom; fix nexted branches --- polyply/src/big_smile_parsing.py | 54 +++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index 2ad65a7b..ddb9bd2a 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -1,4 +1,5 @@ import re +import numpy as np try: import pysmiles except ImportError: @@ -16,6 +17,12 @@ "fragment": r'#(\w+)=((?:\[.*?\]|[^,\[\]]+)*)', "seq_pattern": r'\{([^}]*)\}(?:\.\{([^}]*)\})?'} +def _find_next_character(string, chars, start): + for idx, token in enumerate(string[start:]): + if token in chars: + return idx+start + return np.inf + def res_pattern_to_meta_mol(pattern): """ Generate a :class:`polyply.MetaMolecule` from a @@ -67,13 +74,15 @@ def res_pattern_to_meta_mol(pattern): branching = False for match in re.finditer(PATTERNS['place_holder'], pattern): start, stop = match.span() + print(pattern[start:stop]) # new branch here if pattern[start-1] == '(': branching = True branch_anchor = prev_node recipie = [(meta_mol.nodes[prev_node]['resname'], 1)] if stop < len(pattern) and pattern[stop] == '|': - n_mon = int(pattern[stop+1:pattern.find('[', stop)]) + eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop) + n_mon = int(pattern[stop+1:eon]) else: n_mon = 1 @@ -94,12 +103,17 @@ def res_pattern_to_meta_mol(pattern): current += 1 # terminate branch and jump back to anchor - if stop < len(pattern) and pattern[stop] == ')' and branching: + branch_stop = _find_next_character(pattern, ['['], stop) >\ + _find_next_character(pattern, [')'], stop) + if stop <= len(pattern) and branch_stop and branching: branching = False prev_node = branch_anchor # we have to multiply the branch n-times - if stop+1 < len(pattern) and pattern[stop+1] == "|": - for _ in range(0,int(pattern[stop+2:pattern.find('[', stop)])): + eon_a = _find_next_character(pattern, [')'], stop) + if stop+1 < len(pattern) and pattern[eon_a+1] == "|": + eon_b = _find_next_character(pattern, ['[', ')', '(', '}'], eon_a+1) + # -1 because one branch has already been added at this point + for _ in range(0,int(pattern[eon_a+2:eon_b])-1): for bdx, (resname, n_mon) in enumerate(recipie): if bdx == 0: anchor = 
current @@ -166,12 +180,36 @@ def tokenize_big_smile(big_smile): prev_node = anchor smile += token else: - if token not in '@ . - = # $ : / \\ + - %': + if token not in '@ . - = # $ : / \\ + - %'\ + and not token.isdigit(): prev_node = node_count node_count += 1 smile += token return smile, bonding_descrpt +def _rebuild_h_atoms(mol_graph): + # special hack around to fix + # pysmiles bug for a single + # atom molecule; we assume that the + # hcount is just wrong and set it to + # the valance number minus bonds minus + # bonding connectors + if len(mol_graph.nodes) == 1: + ele = mol_graph.nodes[0]['element'] + # for N and P we assume the regular valency + hcount = pysmiles.smiles_helper.VALENCES[ele][0] + if mol_graph.nodes[0].get('bonding', False): + hcount -= 1 + mol_graph.nodes[0]['hcount'] = hcount + else: + for node in mol_graph.nodes: + if mol_graph.nodes[node].get('bonding', False): + hcount = mol_graph.nodes[node]['hcount'] + mol_graph.nodes[node]['hcount'] = hcount - 1 + + pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) + return mol_graph + def fragment_iter(fragment_str): """ Iterates over fragments defined in a BigSmile string. @@ -197,8 +235,10 @@ def fragment_iter(fragment_str): big_smile = fragment[delim+1:] smile, bonding_descrpt = tokenize_big_smile(big_smile) mol_graph = pysmiles.read_smiles(smile) - atomnames = [str(node[0])+node[1]['element'] for node in mol_graph.nodes(data=True) ] nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + # we need to rebuild hydrogen atoms now + _rebuild_h_atoms(mol_graph) + atomnames = {node[0]: node[1]['element']+str(node[0]) for node in mol_graph.nodes(data=True)} nx.set_node_attributes(mol_graph, atomnames, 'atomname') nx.set_node_attributes(mol_graph, resname, 'resname') yield resname, mol_graph @@ -224,4 +264,4 @@ def force_field_from_fragments(fragment_str): for resname, mol_graph in frag_iter: mol_block = Block(mol_graph) force_field.blocks[resname] = mol_block - return forxe_field + return force_field From 20e2e4917500ef8621a696f4395494a3dfd5a6e8 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 22 Jan 2024 16:37:58 +0100 Subject: [PATCH 06/25] tests for smile iter and test nested branches --- polyply/tests/test_big_smile_parsing.py | 71 ++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py index 43045a83..3265564c 100644 --- a/polyply/tests/test_big_smile_parsing.py +++ b/polyply/tests/test_big_smile_parsing.py @@ -1,7 +1,8 @@ import pytest import networkx as nx from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, - tokenize_big_smile) + tokenize_big_smile, + fragment_iter) @pytest.mark.parametrize('smile, nodes, edges',( # smiple linear seqeunce @@ -20,13 +21,20 @@ ("{[#PMA]|3}", ["PMA", "PMA", "PMA"], [(0, 1), (1, 2)]), - ## simple branched with expansion - #("{[#PMA]([#PEO]|3)|2}", - #["PMA", "PEO", "PEO", "PEO", - # "PMA", "PEO", "PEO", "PEO"], - #[(0, 1), (1, 2), (2, 3), - # (0, 4), (4, 5), (5, 6), (6, 7)] - # ) + # simple branch expension + ("{[#PMA]([#PEO][#PEO][#OHter])|2}", + ["PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter"], + [(0, 1), (1, 2), (2, 3), + (0, 4), (4, 5), (5, 6), (6, 7)] + ), + # nested branched with expansion + ("{[#PMA]([#PEO]|3)|2}", + ["PMA", "PEO", "PEO", "PEO", + "PMA", "PEO", "PEO", "PEO"], + [(0, 1), (1, 2), (2, 3), + (0, 4), (4, 5), (5, 6), (6, 7)] + ) )) def test_res_pattern_to_meta_mol(smile, nodes, edges): """ @@ -49,6 +57,10 
@@ def test_res_pattern_to_meta_mol(smile, nodes, edges): ("[$1]CCCC[$2]", "CCCC", {0: "$1", 3: "$2"}), + # ring and bonding descriptors + ("[$1]CC[$2]C1CCCCC1", + "CCC1CCCCC1", + {0: "$1", 1: "$2"}), # bonding descript. after branch ("C(COC[$1])[$2]CCC[$3]", "C(COC)CCC", @@ -62,3 +74,46 @@ def test_tokenize_big_smile(big_smile, smile, bonding): new_smile, new_bonding = tokenize_big_smile(big_smile) assert new_smile == smile assert new_bonding == bonding + +@pytest.mark.parametrize('fragment_str, nodes, edges',( + # single fragment + ("{#PEO=[$]COC[$]}", + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": "$", "element": "C"}), + (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": "$", "element": "C"}), + (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), + (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), + (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), + (6, {"atomname": "H6", "resname": "PEO", "element": "H"}), + )}, + {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), + # test NH3 terminal + ("{#AMM=N[$]}", + {"AMM": ((0, {"atomname": "N0", "resname": "AMM", "bonding": "$", "element": "N"}), + (1, {"atomname": "H1", "resname": "AMM", "element": "H"}), + (2, {"atomname": "H2", "resname": "AMM", "element": "H"}), + )}, + {"AMM": [(0, 1), (0, 2)]}), + # single fragment + 1 terminal (i.e. only 1 bonding descrpt + ("{#PEO=[$]COC[$],#OHter=[$][OH]}", + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": "$", "element": "C"}), + (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": "$", "element": "C"}), + (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), + (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), + (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), + (6, {"atomname": "H6", "resname": "PEO", "element": "H"}), + ), + "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": "$", "element": "O"}), + (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))}, + {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)], + "OHter": [(0, 1)]}), +)) +def test_fragment_iter(fragment_str, nodes, edges): + for resname, mol_graph in fragment_iter(fragment_str): + assert len(mol_graph.nodes) == len(nodes[resname]) + for node, ref_node in zip(mol_graph.nodes(data=True), nodes[resname]): + assert node[0] == ref_node[0] + for key in ref_node[1]: + assert ref_node[1][key] == node[1][key] + assert sorted(mol_graph.edges) == sorted(edges[resname]) From f505129f98cdff951f9137a832cb604f44bce8f9 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 22 Jan 2024 16:40:17 +0100 Subject: [PATCH 07/25] add pysmiles to test requrm. 
--- requirements-tests.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-tests.txt b/requirements-tests.txt index 595a4902..03357910 100644 --- a/requirements-tests.txt +++ b/requirements-tests.txt @@ -4,3 +4,4 @@ pytest-cov pylint codecov tqdm +pysmiles From 0c67ecc17c530c50aa98781fe7df0bb37324e983 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 22 Jan 2024 18:49:28 +0100 Subject: [PATCH 08/25] add tests for bonding descriptor evaluation --- polyply/tests/test_big_smile_mol_proc.py | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 polyply/tests/test_big_smile_mol_proc.py diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py new file mode 100644 index 00000000..7bcdf9f9 --- /dev/null +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -0,0 +1,37 @@ +import pytest +import networkx as nx +from polyply.src.big_smile_mol_processor import (DefBigSmileParser, + generate_edge) + +@pytest.mark.parametrize('bonds_source, bonds_target, edge, btypes',( + # single bond source each + ({0: "$"}, + {3: "$"}, + (0, 3), + ('$', '$')), + # multiple sources one match + ({0: '$1', 2: '$2'}, + {1: '$2', 3: '$'}, + (2, 1), + ('$2', '$2')), + # left right selective bonding + ({0: '$', 1: '>', 3: '<'}, + {0: '>', 1: '$5'}, + (3, 0), + ('<', '>')), + # left right selective bonding + # with identifier + ({0: '$', 1: '>', 3: '<1'}, + {0: '>', 1: '$5', 2: '>1'}, + (3, 2), + ('<1', '>1')), + +)) +def test_generate_edge(bonds_source, bonds_target, edge, btypes): + source = nx.path_graph(5) + target = nx.path_graph(4) + nx.set_node_attributes(source, bonds_source, "bonding") + nx.set_node_attributes(target, bonds_target, "bonding") + new_edge, new_btypes = generate_edge(source, target, bond_type="bonding") + assert new_edge == edge + assert new_btypes == btypes From 52235c91887a87b1be7bea3d50902f9470e286e2 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 23 Jan 2024 18:57:46 +0100 Subject: [PATCH 09/25] add tests for big smile molecule prc --- polyply/tests/test_big_smile_mol_proc.py | 66 ++++++++++++++++++++---- 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py index 7bcdf9f9..58667ed8 100644 --- a/polyply/tests/test_big_smile_mol_proc.py +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -2,27 +2,32 @@ import networkx as nx from polyply.src.big_smile_mol_processor import (DefBigSmileParser, generate_edge) - +import matplotlib.pyplot as plt @pytest.mark.parametrize('bonds_source, bonds_target, edge, btypes',( # single bond source each - ({0: "$"}, - {3: "$"}, + ({0: ["$"]}, + {3: ["$"]}, + (0, 3), + ('$', '$')), + # include a None + ({0: ["$"], 1: []}, + {3: ["$"]}, (0, 3), ('$', '$')), # multiple sources one match - ({0: '$1', 2: '$2'}, - {1: '$2', 3: '$'}, + ({0: ['$1'], 2: ['$2']}, + {1: ['$2'], 3: ['$']}, (2, 1), ('$2', '$2')), # left right selective bonding - ({0: '$', 1: '>', 3: '<'}, - {0: '>', 1: '$5'}, + ({0: ['$'], 1: ['>'], 3: ['<']}, + {0: ['>'], 1: ['$5']}, (3, 0), ('<', '>')), # left right selective bonding # with identifier - ({0: '$', 1: '>', 3: '<1'}, - {0: '>', 1: '$5', 2: '>1'}, + ({0: ['$'], 1: ['>'], 3: ['<1']}, + {0: ['>'], 1: ['$5'], 2: ['>1']}, (3, 2), ('<1', '>1')), @@ -35,3 +40,46 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): new_edge, new_btypes = generate_edge(source, target, bond_type="bonding") assert new_edge == edge assert new_btypes == btypes + + 
+@pytest.mark.parametrize('smile, ref_nodes, ref_edges',( + # smiple linear seqeunce + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}", + # 0 1 2 3 4 5 6 7 8 + [('OHter', 'O H'), ('PEO', 'C O C H H H H'), + # 9 10 11 12 13 14 15 16 17 + ('PEO', 'C O C H H H H'), ('OHter', 'O H')], + [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), + (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), + (11, 14), (11, 15), (11, 16), (16, 17)]), + # simple branched sequence + ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", + [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), + ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), ('Hter', 'H')], + [(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8), + (8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27), + (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)]), + # something with a ring + # 012 34567 + # 890123456 + ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$]H}", + [('Hter', 'H'), ('PS', 'C C C C C C C C H H H H H H H H'), + ('PS', 'C C C C C C C C H H H H H H H H'), ('Hter', 'H')], + [(0, 1), (1, 2), (1, 9), (1, 10), (2, 3), (2, 11), (2, 17), + (3, 4), (3, 8), (4, 5), (4, 12), (5, 6), (5, 13), (6, 7), + (6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25), + (17, 26), (18, 19), (18, 27), (18, 33), (19, 20), (19, 24), + (20, 21), (20, 28), (21, 22), (21, 29), (22, 23), (22, 30), + (23, 24), (23, 31), (24, 32)]), + +)) +def test_def_big_smile_parser(smile, ref_nodes, ref_edges): + meta_mol = DefBigSmileParser().parse(smile) + for node, ref in zip(meta_mol.nodes, ref_nodes): + assert meta_mol.nodes[node]['resname'] == ref[0] + block_graph = meta_mol.nodes[node]['graph'] + elements = list(nx.get_node_attributes(block_graph, 'element').values()) + assert elements == ref[1].split() + #nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) + #plt.show() + assert sorted(meta_mol.molecule.edges) == sorted(ref_edges) From 9a0a674fa685af88df1567d9051a0bbb308e80d4 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 23 Jan 2024 18:58:18 +0100 Subject: [PATCH 10/25] allow multiple bonding per atom; fix bugs --- polyply/src/big_smile_mol_processor.py | 117 +++++++++++++++++++++++++ polyply/src/big_smile_parsing.py | 22 +++-- 2 files changed, 132 insertions(+), 7 deletions(-) create mode 100644 polyply/src/big_smile_mol_processor.py diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py new file mode 100644 index 00000000..8499e7e3 --- /dev/null +++ b/polyply/src/big_smile_mol_processor.py @@ -0,0 +1,117 @@ +import networkx as nx +from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, + force_field_from_fragments) +from polyply.src.map_to_molecule import MapToMolecule + +def compatible(left, right): + """ + Check bonding descriptor compatibility according + to the BigSmiles syntax convetions. 
+ + Parameters + ---------- + left: str + right: str + + Returns + ------- + bool + """ + if left == right and left not in '> <': + return True + if left[0] == "<" and right[0] == ">": + if left[1:] == right[1:]: + return True + if left[0] == ">" and right[0] == "<": + if left[1:] == right[1:]: + return True + return False + +def generate_edge(source, target, bond_type="bonding"): + """ + Given a source and a target graph, which have bonding + descriptors stored as node attributes, find a pair of + matching descriptors and return the respective nodes. + The function also returns the bonding descriptors. If + no bonding descriptor is found an instance of LookupError + is raised. + + Parameters + ---------- + source: :class:`nx.Graph` + target: :class:`nx.Graph` + bond_type: `abc.hashable` + under which attribute are the bonding descriptors + stored. + + Returns + ------- + ((abc.hashable, abc.hashable), (str, str)) + the nodes as well as bonding descriptors + + Raises + ------ + LookupError + if no match is found + """ + source_nodes = nx.get_node_attributes(source, bond_type) + target_nodes = nx.get_node_attributes(target, bond_type) + for source_node in source_nodes: + for target_node in target_nodes: + #print(source_node, target_node) + bond_sources = source_nodes[source_node] + bond_targets = target_nodes[target_node] + for bond_source in bond_sources: + for bond_target in bond_targets: + #print(bond_source, bond_target) + if compatible(bond_source, bond_target): + return ((source_node, target_node), (bond_source, bond_target)) + raise LookupError + +class DefBigSmileParser: + """ + Parse an a string instance of a defined BigSmile, + which describes a polymer molecule. + """ + + def __init__(self): + self.force_field = None + self.meta_molecule = None + self.molecule = None + + def edges_from_bonding_descrpt(self): + """ + Make edges according to the bonding descriptors stored + in the node attributes of meta_molecule residue graph. + If a bonding descriptor is consumed it is set to None, + however, the meta_molecule edge gets an attribute with the + bonding descriptors that formed the edge. 
+ """ + for prev_node, node in nx.dfs_edges(self.meta_molecule): + prev_graph = self.meta_molecule.nodes[prev_node]['graph'] + node_graph = self.meta_molecule.nodes[node]['graph'] + edge, bonding = generate_edge(prev_graph, + node_graph) + # this is a bit of a workaround because at this stage the + # bonding list is actually shared between all residues of + # of the same type; so we first make a copy then we replace + # the list sans used bonding descriptor + prev_bond_list = prev_graph.nodes[edge[0]]['bonding'].copy() + prev_bond_list.remove(bonding[0]) + prev_graph.nodes[edge[0]]['bonding'] = prev_bond_list + node_bond_list = node_graph.nodes[edge[1]]['bonding'].copy() + node_bond_list.remove(bonding[1]) + node_graph.nodes[edge[1]]['bonding'] = node_bond_list + self.meta_molecule.molecule.add_edge(edge[0], edge[1], bonding=bonding) + + def parse(self, big_smile_str): + res_pattern, residues = big_smile_str.split('.') + self.meta_molecule = res_pattern_to_meta_mol(res_pattern) + self.force_field = force_field_from_fragments(residues) + MapToMolecule(self.force_field).run_molecule(self.meta_molecule) + self.edges_from_bonding_descrpt() + return self.meta_molecule + +# ToDo +# - replace non consumed bonding descrpt by hydrogen +# - diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index ddb9bd2a..fa6348cc 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -1,3 +1,4 @@ +from collections import defaultdict import re import numpy as np try: @@ -154,7 +155,7 @@ def tokenize_big_smile(big_smile): to the nodes within the smile """ smile_iter = _big_smile_iter(big_smile) - bonding_descrpt = {} + bonding_descrpt = defaultdict(list) smile = "" node_count = 0 prev_node = 0 @@ -167,7 +168,7 @@ def tokenize_big_smile(big_smile): while peek != ']': bond_descrp += peek peek = next(smile_iter) - bonding_descrpt[prev_node] = bond_descrp + bonding_descrpt[prev_node].append(bond_descrp) else: smile = smile + token + peek prev_node = node_count @@ -205,7 +206,7 @@ def _rebuild_h_atoms(mol_graph): for node in mol_graph.nodes: if mol_graph.nodes[node].get('bonding', False): hcount = mol_graph.nodes[node]['hcount'] - mol_graph.nodes[node]['hcount'] = hcount - 1 + mol_graph.nodes[node]['hcount'] = hcount - len(mol_graph.nodes[node]['bonding']) pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) return mol_graph @@ -234,10 +235,17 @@ def fragment_iter(fragment_str): resname = fragment[1:delim] big_smile = fragment[delim+1:] smile, bonding_descrpt = tokenize_big_smile(big_smile) - mol_graph = pysmiles.read_smiles(smile) - nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') - # we need to rebuild hydrogen atoms now - _rebuild_h_atoms(mol_graph) + + if smile == "H": + mol_graph = nx.Graph() + mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) + nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + else: + mol_graph = pysmiles.read_smiles(smile) + nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + # we need to rebuild hydrogen atoms now + _rebuild_h_atoms(mol_graph) + atomnames = {node[0]: node[1]['element']+str(node[0]) for node in mol_graph.nodes(data=True)} nx.set_node_attributes(mol_graph, atomnames, 'atomname') nx.set_node_attributes(mol_graph, resname, 'resname') From ceccc3d53fab73921c87bfd29a885fda7e284726 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 24 Jan 2024 10:59:53 +0100 Subject: [PATCH 11/25] remove mpl import --- polyply/tests/test_big_smile_mol_proc.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py index 58667ed8..6975b885 100644 --- a/polyply/tests/test_big_smile_mol_proc.py +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -2,7 +2,7 @@ import networkx as nx from polyply.src.big_smile_mol_processor import (DefBigSmileParser, generate_edge) -import matplotlib.pyplot as plt +#import matplotlib.pyplot as plt @pytest.mark.parametrize('bonds_source, bonds_target, edge, btypes',( # single bond source each ({0: ["$"]}, From 158fd3734f321d2084d7b466297dfe4c0c851d30 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 24 Jan 2024 11:12:13 +0100 Subject: [PATCH 12/25] add changed tests for multiple bonding per atom --- polyply/tests/test_big_smile_parsing.py | 40 ++++++++++++++++++------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py index 3265564c..f7faf71a 100644 --- a/polyply/tests/test_big_smile_parsing.py +++ b/polyply/tests/test_big_smile_parsing.py @@ -52,23 +52,27 @@ def test_res_pattern_to_meta_mol(smile, nodes, edges): # smiple symmetric bonding ("[$]COC[$]", "COC", - {0: '$', 2: '$'}), + {0: ["$"], 2: ["$"]}), + # smiple symmetric bonding; multiple descript + ("[$]COC[$][$1]", + "COC", + {0: ["$"], 2: ["$", "$1"]}), # named different bonding descriptors ("[$1]CCCC[$2]", "CCCC", - {0: "$1", 3: "$2"}), + {0: ["$1"], 3: ["$2"]}), # ring and bonding descriptors ("[$1]CC[$2]C1CCCCC1", "CCC1CCCCC1", - {0: "$1", 1: "$2"}), + {0: ["$1"], 1: ["$2"]}), # bonding descript. after branch ("C(COC[$1])[$2]CCC[$3]", "C(COC)CCC", - {0: '$2', 3: '$1', 6: '$3'}), + {0: ["$2"], 3: ["$1"], 6: ["$3"]}), # left rigth bonding desciptors ("[>]COC[<]", "COC", - {0: '>', 2: '<'}) + {0: [">"], 2: ["<"]}) )) def test_tokenize_big_smile(big_smile, smile, bonding): new_smile, new_bonding = tokenize_big_smile(big_smile) @@ -78,9 +82,9 @@ def test_tokenize_big_smile(big_smile, smile, bonding): @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment ("{#PEO=[$]COC[$]}", - {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": "$", "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}), (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "resname": "PEO", "bonding": "$", "element": "C"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$"], "element": "C"}), (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), @@ -89,25 +93,39 @@ def test_tokenize_big_smile(big_smile, smile, bonding): {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), # test NH3 terminal ("{#AMM=N[$]}", - {"AMM": ((0, {"atomname": "N0", "resname": "AMM", "bonding": "$", "element": "N"}), + {"AMM": ((0, {"atomname": "N0", "resname": "AMM", "bonding": ["$"], "element": "N"}), (1, {"atomname": "H1", "resname": "AMM", "element": "H"}), (2, {"atomname": "H2", "resname": "AMM", "element": "H"}), )}, {"AMM": [(0, 1), (0, 2)]}), # single fragment + 1 terminal (i.e. 
only 1 bonding descrpt ("{#PEO=[$]COC[$],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": "$", "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}), (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "resname": "PEO", "bonding": "$", "element": "C"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$"], "element": "C"}), (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), (6, {"atomname": "H6", "resname": "PEO", "element": "H"}), ), - "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": "$", "element": "O"}), + "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": ["$"], "element": "O"}), (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))}, {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)], "OHter": [(0, 1)]}), + # single fragment + 1 terminal but multiple bond descritp. + # this adjust the hydrogen count + ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}", + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}), + (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$", "$1"], "element": "C"}), + (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), + (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), + (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), + ), + "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": ["$"], "element": "O"}), + (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))}, + {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)], + "OHter": [(0, 1)]}), )) def test_fragment_iter(fragment_str, nodes, edges): for resname, mol_graph in fragment_iter(fragment_str): From 8f2887f5d2149e94330014a8b32f47d64caf1b3d Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 24 Jan 2024 12:13:41 +0100 Subject: [PATCH 13/25] delete old processor file --- polyply/src/big_smile_mol_processsor.py | 99 ------------------------- 1 file changed, 99 deletions(-) delete mode 100644 polyply/src/big_smile_mol_processsor.py diff --git a/polyply/src/big_smile_mol_processsor.py b/polyply/src/big_smile_mol_processsor.py deleted file mode 100644 index 8131e009..00000000 --- a/polyply/src/big_smile_mol_processsor.py +++ /dev/null @@ -1,99 +0,0 @@ -import networkx as nx -from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, - force_field_from_fragments) -from polyply.src.map_to_molecule import MapToMolecule - -def compatible(left, right): - """ - Check bonding descriptor compatibility according - to the BigSmiles syntax convetions. - - Parameters - ---------- - left: str - right: str - - Returns - ------- - bool - """ - if left == right: - return True - if left[0] == "<" and right[0] == ">": - if left[1:] == right[1:]: - return True - if left[0] == ">" and right[0] == "<": - if left[1:] == right[1:]: - return True - return False - -def generate_edge(source, target, bond_type="bonding"): - """ - Given a source and a target graph, which have bonding - descriptors stored as node attributes, find a pair of - matching descriptors and return the respective nodes. - The function also returns the bonding descriptors. If - no bonding descriptor is found an instance of LookupError - is raised. 
- - Parameters - ---------- - source: :class:`nx.Graph` - target: :class:`nx.Graph` - bond_type: `abc.hashable` - under which attribute are the bonding descriptors - stored. - - Returns - ------- - ((abc.hashable, abc.hashable), (str, str)) - the nodes as well as bonding descriptors - - Raises - ------ - LookupError - if no match is found - """ - source_nodes = nx.get_node_attributes(source, bond_type) - target_nodes = nx.get_node_attributes(target, bond_type) - for source_node in source_nodes: - for target_node in target_nodes: - bond_source = source_nodes[source_node] - bond_target = target_nodes[target_node] - if compatible(bond_source, bond_target): - return ((source_node, target_node), (bond_source, bond_target)) - raise LookupError - -class DefBigSmileParser: - """ - Parse an a string instance of a defined BigSmile, - which describes a polymer molecule. - """ - - def __init__(self): - self.force_field = None - self.meta_molecule = None - self.molecule = None - - def edges_from_bonding_descrpt(self): - """ - Make edges according to the bonding descriptors stored - in the node attributes of meta_molecule residue graph. - If a bonding descriptor is consumed it is set to None, - however, the meta_molecule edge gets an attribute with the - bonding descriptors that formed the edge. - """ - for prev_node, node in nx.dfs_edges(self.meta_molecule): - edge, bonding = generate_edge(self.meta_molecule.nodes[prev_node]['graph'], - self.meta_molecule.nodes[node]['graph']) - self.meta_molecule.nodes[prev_node]['graph'][edge[0]]['bonding'] = None - self.meta_molecule.nodes[prev_node]['graph'][edge[1]]['bonding'] = None - self.meta_molecule.molecule.add_edge(edge, bonding=bonding) - - def parse(self, big_smile_str): - res_pattern, residues = big_smile_str.split('.') - self.meta_molecule = res_pattern_to_meta_mol(res_pattern) - self.force_field = force_field_from_fragments(residues) - MapToMolecule(self.force_field).run_molecule(self.meta_molecule) - self.edges_from_bonding_descrpt() - return self.meta_molecule From 7cb3b4cf32fc8eb52b42b4b8a795fafb3cf7faa6 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 3 Mar 2024 14:30:49 +0100 Subject: [PATCH 14/25] add H and ] as special characters in big smile parser --- polyply/src/big_smile_parsing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index fa6348cc..6969a31c 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -75,7 +75,6 @@ def res_pattern_to_meta_mol(pattern): branching = False for match in re.finditer(PATTERNS['place_holder'], pattern): start, stop = match.span() - print(pattern[start:stop]) # new branch here if pattern[start-1] == '(': branching = True @@ -181,7 +180,7 @@ def tokenize_big_smile(big_smile): prev_node = anchor smile += token else: - if token not in '@ . - = # $ : / \\ + - %'\ + if token not in '] H @ . 
- = # $ : / \\ + - %'\ and not token.isdigit(): prev_node = node_count node_count += 1 From 097ec842efefb5d4e5ca8b32d38365174f9af10a Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 3 Mar 2024 14:54:11 +0100 Subject: [PATCH 15/25] account for explicit hydrogen in the smiles string input --- polyply/src/big_smile_parsing.py | 11 +++++++-- polyply/tests/test_big_smile_parsing.py | 30 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index 6969a31c..55d3a6aa 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -204,8 +204,15 @@ def _rebuild_h_atoms(mol_graph): else: for node in mol_graph.nodes: if mol_graph.nodes[node].get('bonding', False): - hcount = mol_graph.nodes[node]['hcount'] - mol_graph.nodes[node]['hcount'] = hcount - len(mol_graph.nodes[node]['bonding']) + # get the degree + ele = mol_graph.nodes[0]['element'] + # hcoung is the valance minus the degree minus + # the number of bonding descriptors + hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\ + mol_graph.degree(node) -\ + len(mol_graph.nodes[node]['bonding']) + + mol_graph.nodes[node]['hcount'] = hcount pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) return mol_graph diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py index f7faf71a..ba3f5f69 100644 --- a/polyply/tests/test_big_smile_parsing.py +++ b/polyply/tests/test_big_smile_parsing.py @@ -53,6 +53,10 @@ def test_res_pattern_to_meta_mol(smile, nodes, edges): ("[$]COC[$]", "COC", {0: ["$"], 2: ["$"]}), + # simple symmetric but with explicit hydrogen + ("[$][CH2]O[CH2][$]", + "[CH2]O[CH2]", + {0: ["$"], 2: ["$"]}), # smiple symmetric bonding; multiple descript ("[$]COC[$][$1]", "COC", @@ -91,6 +95,17 @@ def test_tokenize_big_smile(big_smile, smile, bonding): (6, {"atomname": "H6", "resname": "PEO", "element": "H"}), )}, {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), + # single fragment but with explicit hydrogen in smiles + ("{#PEO=[$][CH2]O[CH2][$]}", + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}), + (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$"], "element": "C"}), + (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), + (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), + (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), + (6, {"atomname": "H6", "resname": "PEO", "element": "H"}), + )}, + {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), # test NH3 terminal ("{#AMM=N[$]}", {"AMM": ((0, {"atomname": "N0", "resname": "AMM", "bonding": ["$"], "element": "N"}), @@ -126,6 +141,21 @@ def test_tokenize_big_smile(big_smile, smile, bonding): (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))}, {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)], "OHter": [(0, 1)]}), + # single fragment + 1 terminal but multiple bond descritp. 
+ # but explicit hydrogen in the smiles string + ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}", + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}), + (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$", "$1"], "element": "C"}), + (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), + (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), + (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), + ), + "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": ["$"], "element": "O"}), + (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))}, + {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)], + "OHter": [(0, 1)]}), + )) def test_fragment_iter(fragment_str, nodes, edges): for resname, mol_graph in fragment_iter(fragment_str): From 514ba1b2408da1bf592845719b05baeb4dc61d12 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 3 Mar 2024 15:57:12 +0100 Subject: [PATCH 16/25] test accounting for explicit hydrogen in the smiles string input --- polyply/src/big_smile_mol_processor.py | 25 ++++++++++++++++++++---- polyply/tests/test_big_smile_mol_proc.py | 13 ++++++++++-- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py index 8499e7e3..f474fe76 100644 --- a/polyply/src/big_smile_mol_processor.py +++ b/polyply/src/big_smile_mol_processor.py @@ -104,14 +104,31 @@ def edges_from_bonding_descrpt(self): node_graph.nodes[edge[1]]['bonding'] = node_bond_list self.meta_molecule.molecule.add_edge(edge[0], edge[1], bonding=bonding) + def replace_unconsumed_bonding_descrpt(self): + """ + We allow multiple bonding descriptors per atom, which + however, are not always consumed. In this case the left + over bonding descriptors are replaced by hydrogen atoms. + """ + for node in self.meta_molecule.nodes: + graph = self.meta_molecule.nodes[node]['graph'] + bonding = nx.get_node_attributes(graph, "bonding") + for node, bondings in bonding.items(): + attrs = {attr: graph.nodes[node][attr] for attr in ['resname', 'resid']} + attrs['element'] = 'H' + for new_id in range(1, len(bondings)+1): + new_node = len(self.meta_molecule.molecule.nodes) + 1 + graph.add_edge(node, new_node) + attrs['atomname'] = "H" + str(new_id + len(graph.nodes)) + graph.nodes[new_node].update(attrs) + self.meta_molecule.molecule.add_edge(node, new_node) + self.meta_molecule.molecule.nodes[new_node].update(attrs) + def parse(self, big_smile_str): res_pattern, residues = big_smile_str.split('.') self.meta_molecule = res_pattern_to_meta_mol(res_pattern) self.force_field = force_field_from_fragments(residues) MapToMolecule(self.force_field).run_molecule(self.meta_molecule) self.edges_from_bonding_descrpt() + self.replace_unconsumed_bonding_descrpt() return self.meta_molecule - -# ToDo -# - replace non consumed bonding descrpt by hydrogen -# - diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py index 6975b885..26e85ba6 100644 --- a/polyply/tests/test_big_smile_mol_proc.py +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -52,6 +52,15 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)]), + # uncomsumed bonding IDs; note that this is not the same + # molecule as previous test case. 
Here one of the OH branches + # and replaces an CH2 group with CH-OH + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[>][$1]COC[<],#OHter=[$1][O]}", + [('OHter', 'O H'), ('PEO', 'C O C H H H H'), + ('PEO', 'C O C H H H H'), ('OHter', 'O H')], + [(0, 1), (0, 2), (2, 3), (2, 5), (2, 10), (3, 4), + (4, 6), (4, 7), (4, 17), (8, 9), (8, 11), (8, 14), + (8, 18), (9, 10), (10, 12), (10, 13), (14, 15)]), # simple branched sequence ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), @@ -75,11 +84,11 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): )) def test_def_big_smile_parser(smile, ref_nodes, ref_edges): meta_mol = DefBigSmileParser().parse(smile) +# nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) +# plt.show() for node, ref in zip(meta_mol.nodes, ref_nodes): assert meta_mol.nodes[node]['resname'] == ref[0] block_graph = meta_mol.nodes[node]['graph'] elements = list(nx.get_node_attributes(block_graph, 'element').values()) assert elements == ref[1].split() - #nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) - #plt.show() assert sorted(meta_mol.molecule.edges) == sorted(ref_edges) From d97632d57a427479f1a77fe380acc41df97ef3d1 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 4 Mar 2024 15:35:24 +0100 Subject: [PATCH 17/25] adjust doc string --- polyply/src/big_smile_mol_processor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py index f474fe76..b533e818 100644 --- a/polyply/src/big_smile_mol_processor.py +++ b/polyply/src/big_smile_mol_processor.py @@ -83,9 +83,10 @@ def edges_from_bonding_descrpt(self): """ Make edges according to the bonding descriptors stored in the node attributes of meta_molecule residue graph. - If a bonding descriptor is consumed it is set to None, + If a bonding descriptor is consumed it is removed from the list, however, the meta_molecule edge gets an attribute with the - bonding descriptors that formed the edge. + bonding descriptors that formed the edge. Later uncomsumed + bonding descriptors are replaced by hydrogen atoms. """ for prev_node, node in nx.dfs_edges(self.meta_molecule): prev_graph = self.meta_molecule.nodes[prev_node]['graph'] From c4f16527532172f887689b86bc3659284ce33f97 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 6 Mar 2024 18:00:36 +0100 Subject: [PATCH 18/25] parse force-field in molprocessor, adjust hydrogen reconstruction --- polyply/src/big_smile_mol_processor.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py index b533e818..640c40e1 100644 --- a/polyply/src/big_smile_mol_processor.py +++ b/polyply/src/big_smile_mol_processor.py @@ -1,8 +1,12 @@ import networkx as nx +import pysmiles from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, force_field_from_fragments) from polyply.src.map_to_molecule import MapToMolecule +VALENCES = pysmiles.smiles_helper.VALENCES +VALENCES.update({"H":(1,)}) + def compatible(left, right): """ Check bonding descriptor compatibility according @@ -74,8 +78,8 @@ class DefBigSmileParser: which describes a polymer molecule. 
""" - def __init__(self): - self.force_field = None + def __init__(self, force_field): + self.force_field = force_field self.meta_molecule = None self.molecule = None @@ -115,9 +119,12 @@ def replace_unconsumed_bonding_descrpt(self): graph = self.meta_molecule.nodes[node]['graph'] bonding = nx.get_node_attributes(graph, "bonding") for node, bondings in bonding.items(): + element = graph.nodes[node]['element'] + hcount = VALENCES[element][0] -\ + self.meta_molecule.molecule.degree(node) + 1 attrs = {attr: graph.nodes[node][attr] for attr in ['resname', 'resid']} attrs['element'] = 'H' - for new_id in range(1, len(bondings)+1): + for new_id in range(1, hcount): new_node = len(self.meta_molecule.molecule.nodes) + 1 graph.add_edge(node, new_node) attrs['atomname'] = "H" + str(new_id + len(graph.nodes)) From 7a5dd1f74e76e8c15103a5ad93e9490d253403bf Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 6 Mar 2024 19:12:26 +0100 Subject: [PATCH 19/25] fix tests --- polyply/tests/test_big_smile_mol_proc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py index 26e85ba6..28c5390d 100644 --- a/polyply/tests/test_big_smile_mol_proc.py +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -1,5 +1,6 @@ import pytest import networkx as nx +from vermouth.forcefield import ForceField from polyply.src.big_smile_mol_processor import (DefBigSmileParser, generate_edge) #import matplotlib.pyplot as plt @@ -83,7 +84,8 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): )) def test_def_big_smile_parser(smile, ref_nodes, ref_edges): - meta_mol = DefBigSmileParser().parse(smile) + ff = ForceField("new") + meta_mol = DefBigSmileParser(ff).parse(smile) # nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) # plt.show() for node, ref in zip(meta_mol.nodes, ref_nodes): From 2b9e7a9cdc414a3961efb802e667db74a7572a56 Mon Sep 17 00:00:00 2001 From: "Dr. Fabian Grunewald" <32294573+fgrunewald@users.noreply.github.com> Date: Wed, 6 Mar 2024 19:16:53 +0100 Subject: [PATCH 20/25] Apply suggestions from code review Co-authored-by: Peter C Kroon --- polyply/src/big_smile_mol_processor.py | 9 +++------ polyply/src/big_smile_parsing.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py index 640c40e1..365b61bc 100644 --- a/polyply/src/big_smile_mol_processor.py +++ b/polyply/src/big_smile_mol_processor.py @@ -23,12 +23,9 @@ def compatible(left, right): """ if left == right and left not in '> <': return True - if left[0] == "<" and right[0] == ">": - if left[1:] == right[1:]: - return True - if left[0] == ">" and right[0] == "<": - if left[1:] == right[1:]: - return True + l, r = left[0], right[0] + if (l, r) == ('<', '>') or (l, r) == ('>', '<'): + return left[1:] == right[1:] return False def generate_edge(source, target, bond_type="bonding"): diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index 55d3a6aa..90e171a3 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -3,10 +3,10 @@ import numpy as np try: import pysmiles -except ImportError: +except ImportError as error: msg = ("You are using a functionality that requires " "the pysmiles package. 
Use pip install pysmiles ") - raise ImportError(msg) + raise ImportError(msg) from error import networkx as nx from vermouth.forcefield import ForceField from vermouth.molecule import Block @@ -41,7 +41,7 @@ def res_pattern_to_meta_mol(pattern): '{' + [#resname_1][#resname_2]... + '}' In addition to plain enumeration any residue may be - followed by a '|' and an integern number that + followed by a '|' and an integer number that specifies how many times the given residue should be added within a sequence. For example, a pentamer of PEO can be written as: @@ -52,10 +52,10 @@ def res_pattern_to_meta_mol(pattern): {[#PEO]|5} - The block syntax also applies to branches. Here the convetion + The block syntax also applies to branches. Here the convention is that the complete branch including it's first anchoring residue is repeated. For example, to generate a PMA-g-PEG - polymer the following syntax is permitted: + polymer containing 15 residues the following syntax is permitted: {[#PMA]([#PEO][#PEO])|5} @@ -79,7 +79,7 @@ def res_pattern_to_meta_mol(pattern): if pattern[start-1] == '(': branching = True branch_anchor = prev_node - recipie = [(meta_mol.nodes[prev_node]['resname'], 1)] + recipe = [(meta_mol.nodes[prev_node]['resname'], 1)] if stop < len(pattern) and pattern[stop] == '|': eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop) n_mon = int(pattern[stop+1:eon]) @@ -89,7 +89,7 @@ def res_pattern_to_meta_mol(pattern): resname = match.group(0)[2:-1] # collect all residues in branch if branching: - recipie.append((resname, n_mon)) + recipe.append((resname, n_mon)) # add the new residue connection = [] @@ -135,7 +135,7 @@ def tokenize_big_smile(big_smile): """ Processes a BigSmile string by storing the the BigSmile specific bonding descriptors - in a dict with refernce to the atom they + in a dict with reference to the atom they refer to. Furthermore, a cleaned smile string is generated with the BigSmile specific syntax removed. 
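For reference, a minimal usage sketch of the bonding descriptor compatibility rule refactored in PATCH 20/25 above. This sketch is not part of any patch in the series; it only assumes the module layout used here (compatible living in polyply/src/big_smile_mol_processor.py):

    from polyply.src.big_smile_mol_processor import compatible

    # '$'-type descriptors bond to an identical descriptor, while '<' and '>'
    # descriptors bond to their complementary partner carrying the same label.
    assert compatible("$1", "$1")       # identical '$' descriptors match
    assert compatible("<1", ">1")       # complementary pair, same label
    assert compatible(">2", "<2")       # order of the pair does not matter
    assert not compatible("$1", "$2")   # different '$' labels do not match
    assert not compatible("<1", ">2")   # complementary pair, mismatched labels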
From b6d891f6f32bbc60ed96fd30d75953f717d21117 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 7 Mar 2024 14:24:47 +0100 Subject: [PATCH 21/25] allow nested branch expansion --- polyply/src/big_smile_parsing.py | 81 ++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index 90e171a3..e396a5e3 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -24,6 +24,22 @@ def _find_next_character(string, chars, start): return idx+start return np.inf +def _expand_branch(meta_mol, current, anchor, recipe): + prev_node = anchor + for bdx, (resname, n_mon) in enumerate(recipe): + if bdx == 0: + anchor = current + for _ in range(0, n_mon): + connection = [(prev_node, current)] + print(connection) + meta_mol.add_monomer(current, + resname, + connection) + prev_node = current + current += 1 + prev_node = anchor + return meta_mol, current, prev_node + def res_pattern_to_meta_mol(pattern): """ Generate a :class:`polyply.MetaMolecule` from a @@ -70,17 +86,30 @@ def res_pattern_to_meta_mol(pattern): """ meta_mol = MetaMolecule() current = 0 - branch_anchor = 0 + # stores one or more branch anchors; each next + # anchor belongs to a nested branch + branch_anchor = [] + # used for storing composition protocol for + # for branches; each entry is a list of + # branches from extending from the anchor + # point + recipes = defaultdict(list) + # the previous node prev_node = None + # do we have an open branch branching = False for match in re.finditer(PATTERNS['place_holder'], pattern): start, stop = match.span() # new branch here if pattern[start-1] == '(': branching = True - branch_anchor = prev_node - recipe = [(meta_mol.nodes[prev_node]['resname'], 1)] + branch_anchor.append(prev_node) + # the recipe for making the branch includes the anchor; which + # is hence the first atom in the list + if len(branch_anchor) == 1: + recipes[branch_anchor[-1]] = [(meta_mol.nodes[prev_node]['resname'], 1)] if stop < len(pattern) and pattern[stop] == '|': + # eon => end of next eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop) n_mon = int(pattern[stop+1:eon]) else: @@ -89,7 +118,7 @@ def res_pattern_to_meta_mol(pattern): resname = match.group(0)[2:-1] # collect all residues in branch if branching: - recipe.append((resname, n_mon)) + recipes[branch_anchor[-1]].append((resname, n_mon)) # add the new residue connection = [] @@ -105,26 +134,40 @@ def res_pattern_to_meta_mol(pattern): # terminate branch and jump back to anchor branch_stop = _find_next_character(pattern, ['['], stop) >\ _find_next_character(pattern, [')'], stop) - if stop <= len(pattern) and branch_stop and branching: + + if stop <= len(pattern) and branch_stop: # and branching: branching = False - prev_node = branch_anchor + prev_node = branch_anchor.pop() + if branch_anchor: + branching = True # we have to multiply the branch n-times eon_a = _find_next_character(pattern, [')'], stop) if stop+1 < len(pattern) and pattern[eon_a+1] == "|": eon_b = _find_next_character(pattern, ['[', ')', '(', '}'], eon_a+1) - # -1 because one branch has already been added at this point - for _ in range(0,int(pattern[eon_a+2:eon_b])-1): - for bdx, (resname, n_mon) in enumerate(recipie): - if bdx == 0: - anchor = current - for _ in range(0, n_mon): - connection = [(prev_node, current)] - meta_mol.add_monomer(current, - resname, - connection) - prev_node = current - current += 1 - prev_node = anchor + # the outermost loop 
goes over how often a the branch has to be + # added to the existing sequence + for idx in range(0,int(pattern[eon_a+2:eon_b])-1): + prev_anchor = None + skip = 0 + for ref_anchor, recipe in list(recipes.items())[len(branch_anchor):]: + print("-->", recipe) + if prev_anchor: + offset = ref_anchor - prev_anchor + prev_node = prev_node + offset + #skip = 1 + print(prev_node) + meta_mol, current, prev_node = _expand_branch(meta_mol, + current=current, + anchor=prev_node, + recipe=recipe) #[skip:]) + if prev_anchor is None: + base_anchor = prev_node + prev_anchor = ref_anchor + print(base_anchor) + prev_node = base_anchor + # if all branches are done we need to reset the lists + # branch_anchor = [] + # recipes = defaultdict(list) return meta_mol def _big_smile_iter(smile): From a867329a82d2f4988e43839b84357032841534a4 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 7 Mar 2024 14:56:16 +0100 Subject: [PATCH 22/25] test branch expansion --- polyply/src/big_smile_parsing.py | 17 +++--- polyply/tests/test_big_smile_parsing.py | 69 +++++++++++++++++++++++-- 2 files changed, 73 insertions(+), 13 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index e396a5e3..d93265ec 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -31,7 +31,6 @@ def _expand_branch(meta_mol, current, anchor, recipe): anchor = current for _ in range(0, n_mon): connection = [(prev_node, current)] - print(connection) meta_mol.add_monomer(current, resname, connection) @@ -106,8 +105,8 @@ def res_pattern_to_meta_mol(pattern): branch_anchor.append(prev_node) # the recipe for making the branch includes the anchor; which # is hence the first atom in the list - if len(branch_anchor) == 1: - recipes[branch_anchor[-1]] = [(meta_mol.nodes[prev_node]['resname'], 1)] + #if len(branch_anchor) == 1: + recipes[branch_anchor[-1]] = [(meta_mol.nodes[prev_node]['resname'], 1)] if stop < len(pattern) and pattern[stop] == '|': # eon => end of next eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop) @@ -150,24 +149,22 @@ def res_pattern_to_meta_mol(pattern): prev_anchor = None skip = 0 for ref_anchor, recipe in list(recipes.items())[len(branch_anchor):]: - print("-->", recipe) if prev_anchor: offset = ref_anchor - prev_anchor prev_node = prev_node + offset - #skip = 1 - print(prev_node) + skip = 1 meta_mol, current, prev_node = _expand_branch(meta_mol, current=current, anchor=prev_node, - recipe=recipe) #[skip:]) + recipe=recipe[skip:]) if prev_anchor is None: base_anchor = prev_node prev_anchor = ref_anchor - print(base_anchor) prev_node = base_anchor # if all branches are done we need to reset the lists - # branch_anchor = [] - # recipes = defaultdict(list) + # when all nested branches are completed + if len(branch_anchor) == 0: + recipes = defaultdict(list) return meta_mol def _big_smile_iter(smile): diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py index ba3f5f69..5c1491b8 100644 --- a/polyply/tests/test_big_smile_parsing.py +++ b/polyply/tests/test_big_smile_parsing.py @@ -22,11 +22,13 @@ ["PMA", "PMA", "PMA"], [(0, 1), (1, 2)]), # simple branch expension - ("{[#PMA]([#PEO][#PEO][#OHter])|2}", + ("{[#PMA]([#PEO][#PEO][#OHter])|3}", ["PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter", "PMA", "PEO", "PEO", "OHter"], [(0, 1), (1, 2), (2, 3), - (0, 4), (4, 5), (5, 6), (6, 7)] + (0, 4), (4, 5), (5, 6), (6, 7), + (4, 8), (8, 9), (9, 10), (10, 11)] ), # nested branched with expansion 
("{[#PMA]([#PEO]|3)|2}", @@ -34,7 +36,68 @@ "PMA", "PEO", "PEO", "PEO"], [(0, 1), (1, 2), (2, 3), (0, 4), (4, 5), (5, 6), (6, 7)] - ) + ), + # nested braching + # 0 1 2 3 4 5 6 + ("{[#PMA][#PMA]([#PEO][#PEO]([#OH])[#PEO])[#PMA]}", + ["PMA", "PMA", "PEO", "PEO", "OH", + "PEO", "PMA"], + [(0, 1), (1, 2), (2, 3), + (3, 4), (3, 5), (1, 6)] + ), + # nested braching plus expansion + # 0 1 2 3 4/5 6 7 + ("{[#PMA][#PMA]([#PEO][#PEO]([#OH]|2)[#PEO])[#PMA]}", + ["PMA", "PMA", "PEO", "PEO", "OH", "OH", + "PEO", "PMA"], + [(0, 1), (1, 2), (2, 3), + (3, 4), (4, 5), (3, 6), (1, 7)] + ), + # nested braching plus expansion incl. branch + # 0 1 2 3 4 5 + # 6 7 8 9 10 11 + ("{[#PMA][#PMA]([#PEO][#PEO]([#OH])[#PEO])|2[#PMA]}", + ["PMA", "PMA", "PEO", "PEO", "OH", "PEO", + "PMA", "PEO", "PEO", "PEO", "OH", "PMA"], + [(0, 1), (1, 2), (2, 3), + (3, 4), (3, 5), (1, 6), (6, 7), (7, 8), + (8, 9), (8, 10), (6, 11)] + ), + # nested braching plus expansion of nested branch + # here the nested branch is expended + # 0 - 1 - 10 + # | + # 2 + # | + # 3 {- 5 - 7 } - 9 -> the expanded fragment + # | | | + # 4 6 8 + ("{[#PMA][#PMA]([#PEO][#PQ]([#OH])|3[#PEO])[#PMA]}", + ["PMA", "PMA", "PEO", "PQ", "OH", + "PQ", "OH", "PQ", "OH", "PEO", "PMA"], + [(0, 1), (1, 2), (1, 10), + (2, 3), (3, 4), (3, 5), (5, 6), + (5, 7), (7, 8), (7, 9)] + ), + # nested braching plus expansion of nested branch + # here the nested branch is expended and a complete + # new branch is added + # 11 13 + # | | + # 0 - 1 - 10 - 12 + # | + # 2 + # | + # 3 {- 5 - 7 } - 9 -> the expanded fragment + # | | | + # 4 6 8 + ("{[#PMA][#PMA]([#PEO][#PQ]([#OH])|3[#PEO])[#PMA]([#CH3])|2}", + ["PMA", "PMA", "PEO", "PQ", "OH", + "PQ", "OH", "PQ", "OH", "PEO", "PMA", "CH3", "PMA", "CH3"], + [(0, 1), (1, 2), (1, 10), + (2, 3), (3, 4), (3, 5), (5, 6), + (5, 7), (7, 8), (7, 9), (10, 11), (10, 12), (12, 13)] + ), )) def test_res_pattern_to_meta_mol(smile, nodes, edges): """ From b6f5cc0d4a101948ca853cf1226afb598f6b96f3 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 7 Mar 2024 15:19:09 +0100 Subject: [PATCH 23/25] add comments all over residue expansion functions --- polyply/src/big_smile_parsing.py | 62 +++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index d93265ec..b043ebd9 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -97,29 +97,42 @@ def res_pattern_to_meta_mol(pattern): prev_node = None # do we have an open branch branching = False + # each element in the for loop matches a pattern + # '[' + '#' + some alphanumeric name + ']' for match in re.finditer(PATTERNS['place_holder'], pattern): start, stop = match.span() - # new branch here + # we start a new branch when the residue is preceded by '(' + # as in ... ([#PEO] ... if pattern[start-1] == '(': branching = True branch_anchor.append(prev_node) # the recipe for making the branch includes the anchor; which - # is hence the first atom in the list - #if len(branch_anchor) == 1: + # is hence the first residue in the list recipes[branch_anchor[-1]] = [(meta_mol.nodes[prev_node]['resname'], 1)] + # here we check if the atom is followed by a expansion character '|' + # as in ... 
[#PEO]| if stop < len(pattern) and pattern[stop] == '|': # eon => end of next + # we find the next character that starts a new residue, ends a branch or + # ends the complete pattern eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop) + # between the expansion character and the eon character + # is any number that correspnds to the number of times (i.e. monomers) + # that this atom should be added n_mon = int(pattern[stop+1:eon]) else: n_mon = 1 + # the resname starts at the second character and ends + # one before the last according to the above pattern resname = match.group(0)[2:-1] - # collect all residues in branch + # if this residue is part of a branch we store it in + # the recipe dict together with the anchor residue + # and expansion number if branching: recipes[branch_anchor[-1]].append((resname, n_mon)) - # add the new residue + # new we add new residue as often as required connection = [] for _ in range(0, n_mon): if prev_node is not None: @@ -130,36 +143,69 @@ def res_pattern_to_meta_mol(pattern): prev_node = current current += 1 - # terminate branch and jump back to anchor + # here we check if the residue considered before is the + # last residue of a branch (i.e. '...[#residue])' + # that is the case if the branch closure comes before + # any new atom begins branch_stop = _find_next_character(pattern, ['['], stop) >\ _find_next_character(pattern, [')'], stop) - if stop <= len(pattern) and branch_stop: # and branching: + # if the branch ends we reset the anchor + # and set branching False unless we are in + # a nested branch + if stop <= len(pattern) and branch_stop: branching = False prev_node = branch_anchor.pop() if branch_anchor: branching = True - # we have to multiply the branch n-times + #======================================== + # expansion for branches + #======================================== + # We need to know how often the branch has + # to be added so we first identify the branch + # terminal character ')' called eon_a. eon_a = _find_next_character(pattern, [')'], stop) + # Then we check if the expansion character + # is next. if stop+1 < len(pattern) and pattern[eon_a+1] == "|": + # If there is one we find the beginning + # of the next branch, residue or end of the string + # As before all characters inbetween are a number that + # is how often the branch is expanded. eon_b = _find_next_character(pattern, ['[', ')', '(', '}'], eon_a+1) # the outermost loop goes over how often a the branch has to be # added to the existing sequence for idx in range(0,int(pattern[eon_a+2:eon_b])-1): prev_anchor = None skip = 0 + # in principle each branch can contain any number of nested branches + # each branch is itself a recipe that has an anchor atom for ref_anchor, recipe in list(recipes.items())[len(branch_anchor):]: + # starting from the first nested branch we have to do some + # math to find the anchor atom relative to the first branch + # we also skip the first residue in recipe, which is the + # anchor residue. Only the outermost branch in an expansion + # is expanded including the anchor. This allows easy description + # of graft polymers. 
                        if prev_anchor:
                            offset = ref_anchor - prev_anchor
                            prev_node = prev_node + offset
                            skip = 1
+                        # this function simply adds the residues of the particular
+                        # branch
                         meta_mol, current, prev_node = _expand_branch(meta_mol,
                                                                       current=current,
                                                                       anchor=prev_node,
                                                                       recipe=recipe[skip:])
+                        # if this is the first branch we want to set the anchor
+                        # as the base anchor to which we jump back after all nested
+                        # branches have been added
                         if prev_anchor is None:
                             base_anchor = prev_node
+                        # store the previous anchor so we can do the math for nested
+                        # branches
                         prev_anchor = ref_anchor
+                # all branches added; then go back to the base anchor
                 prev_node = base_anchor
                 # if all branches are done we need to reset the lists
                 # when all nested branches are completed
                 if len(branch_anchor) == 0:
                     recipes = defaultdict(list)
     return meta_mol

From f965e1d42c6b3afb41b690407c36aeee0a4e8493 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald
Date: Thu, 7 Mar 2024 15:33:45 +0100
Subject: [PATCH 24/25] address comments

---
 polyply/src/big_smile_mol_processor.py   |  8 ++++----
 polyply/src/big_smile_parsing.py         | 12 ++++--------
 polyply/tests/test_big_smile_mol_proc.py |  2 +-
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py
index 365b61bc..e706217a 100644
--- a/polyply/src/big_smile_mol_processor.py
+++ b/polyply/src/big_smile_mol_processor.py
@@ -28,7 +28,7 @@ def compatible(left, right):
         return left[1:] == right[1:]
     return False

-def generate_edge(source, target, bond_type="bonding"):
+def generate_edge(source, target, bond_attribute="bonding"):
     """
     Given a source and a target graph, which have bonding
     descriptors stored as node attributes, find a pair of
@@ -41,7 +41,7 @@ def generate_edge(source, target, bond_type="bonding"):
     ----------
     source: :class:`nx.Graph`
     target: :class:`nx.Graph`
-    bond_type: `abc.hashable`
+    bond_attribute: `abc.hashable`
         under which attribute are the bonding
         descriptors stored.
@@ -55,8 +55,8 @@ def generate_edge(source, target, bond_type="bonding"): LookupError if no match is found """ - source_nodes = nx.get_node_attributes(source, bond_type) - target_nodes = nx.get_node_attributes(target, bond_type) + source_nodes = nx.get_node_attributes(source, bond_attribute) + target_nodes = nx.get_node_attributes(target, bond_attribute) for source_node in source_nodes: for target_node in target_nodes: #print(source_node, target_node) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index b043ebd9..d591eecd 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -213,10 +213,6 @@ def res_pattern_to_meta_mol(pattern): recipes = defaultdict(list) return meta_mol -def _big_smile_iter(smile): - for token in smile: - yield token - def tokenize_big_smile(big_smile): """ Processes a BigSmile string by storing the @@ -229,17 +225,17 @@ def tokenize_big_smile(big_smile): Parameters ---------- smile: str - a BigSmile smile string + a BigSmile smiles string Returns ------- str - a canonical smile string + a canonical smiles string dict a dict mapping bonding descriptors - to the nodes within the smile + to the nodes within the smiles string """ - smile_iter = _big_smile_iter(big_smile) + smile_iter = iter(big_smile) bonding_descrpt = defaultdict(list) smile = "" node_count = 0 diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py index 28c5390d..c40f96bd 100644 --- a/polyply/tests/test_big_smile_mol_proc.py +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -38,7 +38,7 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): target = nx.path_graph(4) nx.set_node_attributes(source, bonds_source, "bonding") nx.set_node_attributes(target, bonds_target, "bonding") - new_edge, new_btypes = generate_edge(source, target, bond_type="bonding") + new_edge, new_btypes = generate_edge(source, target, bond_attribute="bonding") assert new_edge == edge assert new_btypes == btypes From 0335956072b84fcc8d59f2d9b6264917ce971879 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 7 Mar 2024 16:00:29 +0100 Subject: [PATCH 25/25] allow for ionic bonds with . 
syntax

---
 polyply/src/big_smile_mol_processor.py   |  6 +++++-
 polyply/src/big_smile_parsing.py         |  4 ++++
 polyply/tests/test_big_smile_mol_proc.py | 10 ++++++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py
index e706217a..1801a437 100644
--- a/polyply/src/big_smile_mol_processor.py
+++ b/polyply/src/big_smile_mol_processor.py
@@ -1,3 +1,4 @@
+import re
 import networkx as nx
 import pysmiles
 from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol,
@@ -130,10 +131,13 @@ def replace_unconsumed_bonding_descrpt(self):
                     self.meta_molecule.molecule.nodes[new_node].update(attrs)

     def parse(self, big_smile_str):
-        res_pattern, residues = big_smile_str.split('.')
+        res_pattern, residues = re.findall(r"\{[^\}]+\}", big_smile_str)
         self.meta_molecule = res_pattern_to_meta_mol(res_pattern)
         self.force_field = force_field_from_fragments(residues)
         MapToMolecule(self.force_field).run_molecule(self.meta_molecule)
         self.edges_from_bonding_descrpt()
         self.replace_unconsumed_bonding_descrpt()
         return self.meta_molecule
+
+# ToDo
+# - clean copying of bond-list attributes L100
diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py
index d591eecd..ec136bea 100644
--- a/polyply/src/big_smile_parsing.py
+++ b/polyply/src/big_smile_parsing.py
@@ -361,3 +361,7 @@ def force_field_from_fragments(fragment_str):
         mol_block = Block(mol_graph)
         force_field.blocks[resname] = mol_block
     return force_field
+
+# ToDos
+# - remove special case hydrogen line 327ff
+# - check rebuild_h and clean up
diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py
index c40f96bd..b6fe8e03 100644
--- a/polyply/tests/test_big_smile_mol_proc.py
+++ b/polyply/tests/test_big_smile_mol_proc.py
@@ -53,6 +53,16 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
      [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
       (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
       (11, 14), (11, 15), (11, 16), (16, 17)]),
+    # simple linear sequence with ionic bond
+    ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O].[Na+]}",
+    # 0 1 2 3 4 5 6 7 8
+    [('OHter', 'O Na'), ('PEO', 'C O C H H H H'),
+    # 9 10 11 12 13 14 15 16 17
+    ('PEO', 'C O C H H H H'), ('OHter', 'O Na')],
+    [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
+     (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
+     (11, 14), (11, 15), (11, 16), (16, 17)]),
+
     # uncomsumed bonding IDs; note that this is not the same
     # molecule as previous test case. Here one of the OH branches
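For reference, a minimal end-to-end sketch of the ionic-bond ('.') syntax introduced in PATCH 25/25, built directly from the new test case above. The sketch is not part of any patch; it assumes the DefBigSmileParser API as it stands at this point in the series (a ForceField instance passed to the constructor, as in the tests):

    from vermouth.forcefield import ForceField
    from polyply.src.big_smile_mol_processor import DefBigSmileParser

    # PEO dimer with O/Na termini; the '.' inside the #OHter fragment keeps the
    # sodium as an ionically bound atom rather than adding a covalent bond.
    big_smile = "{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O].[Na+]}"
    meta_mol = DefBigSmileParser(ForceField("new")).parse(big_smile)
    # per the test case above this yields the residues
    # ['OHter', 'PEO', 'PEO', 'OHter']
    print([meta_mol.nodes[node]['resname'] for node in meta_mol.nodes])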