From 40b89af2fe29eda0c1d61123cdd4b2eb20318eb3 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 19 Jan 2024 10:44:59 +0100 Subject: [PATCH 01/25] infrastructure for big smile parsing --- polyply/src/big_smile_parsing.py | 222 +++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 polyply/src/big_smile_parsing.py diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py new file mode 100644 index 00000000..72e504e6 --- /dev/null +++ b/polyply/src/big_smile_parsing.py @@ -0,0 +1,222 @@ +import re +import pysmiles +import networkx as nx +from vermouth.forcefield import ForceField +from vermouth.molecule import Block +from polyply.src.meta_molecule import MetaMolecule + +PATTERNS = {"bond_anchor": "\[\$.*?\]", + "place_holder": "\[\#.*?\]", + "annotation": "\|.*?\|", + "fragment": r'#(\w+)=((?:\[.*?\]|[^,\[\]]+)*)', + "seq_pattern": r'\{([^}]*)\}(?:\.\{([^}]*)\})?'} + +def res_pattern_to_meta_mol(pattern): + """ + Generate a :class:`polyply.MetaMolecule` from a + pattern string describing a residue graph with the + simplified big-smile syntax. + + The syntax scheme consists of two curly braces + enclosing the residue graph sequence. It can contain + any enumeration of residues by writing them as if they + were smile atoms but the atomname is given by # + resname. + This input fomat can handle branching as well ,however, + macrocycles are currently not supported. + + General Pattern + '{' + [#resname_1][#resname_2]... + '}' + + In addition to plain enumeration any residue may be + followed by a '|' and an integern number that + specifies how many times the given residue should + be added within a sequence. For example, a pentamer + of PEO can be written as: + + {[#PEO][#PEO][#PEO][#PEO][#PEO]} + + or + + {[#PEO]|5} + + The block syntax also applies to branches. Here the convetion + is that the complete branch including it's first anchoring + residue is repeated. 
For example, to generate a PMA-g-PEG + polymer the following syntax is permitted: + + {[#PMA]([#PEO][#PEO])|5} + + Parameters + ---------- + pattern: str + a string describing the meta-molecule + + Returns + ------- + :class:`polyply.MetaMolecule` + """ + meta_mol = MetaMolecule() + current = 0 + branch_anchor = 0 + prev_node = None + branching = False + for match in re.finditer(PATTERNS['place_holder'], pattern): + start, stop = match.span() + # new branch here + if pattern[start-1] == '(': + branching = True + branch_anchor = prev_node + recipie = [(meta_mol.nodes[prev_node]['resname'], 1)] + if stop < len(pattern) and pattern[stop] == '|': + n_mon = int(pattern[stop+1:pattern.find('[', stop)]) + else: + n_mon = 1 + + resname = match.group(0)[2:-1] + # collect all residues in branch + if branching: + recipie.append((resname, n_mon)) + + # add the new residue + connection = [] + for _ in range(0, n_mon): + if prev_node is not None: + connection = [(prev_node, current)] + meta_mol.add_monomer(current, + resname, + connection) + prev_node = current + current += 1 + + # terminate branch and jump back to anchor + if stop < len(pattern) and pattern[stop] == ')' and branching: + branching = False + prev_node = branch_anchor + # we have to multiply the branch n-times + if stop+1 < len(pattern) and pattern[stop+1] == "|": + for _ in range(0,int(pattern[stop+2:pattern.find('[', stop)])): + for bdx, (resname, n_mon) in enumerate(recipie): + if bdx == 0: + anchor = current + for _ in range(0, n_mon): + connection = [(prev_node, current)] + meta_mol.add_monomer(current, + resname, + connection) + prev_node = current + current += 1 + prev_node = anchor + return meta_mol + +def _big_smile_iter(smile): + for token in smile: + yield token + +def tokenize_big_smile(big_smile): + """ + Processes a BigSmile string by storing the + the BigSmile specific bonding descriptors + in a dict with refernce to the atom they + refer to. Furthermore, a cleaned smile + string is generated with the BigSmile + specific syntax removed. + + Parameters + ---------- + smile: str + a BigSmile smile string + + Returns + ------- + str + a canonical smile string + dict + a dict mapping bonding descriptors + to the nodes within the smile + """ + smile_iter = _big_smile_iter(big_smile) + bonding_descrpt = {} + smile = "" + node_count = 0 + prev_node = 0 + for token in smile_iter: + if token == '[': + peek = next(smile_iter) + if peek in ['$', '>', '<']: + bond_descrp = peek + peek = next(smile_iter) + while peek != ']': + bond_descrp += peek + peek = next(smile_iter) + bonding_descrpt[prev_node] = bond_descrp + else: + smile = smile + token + peek + prev_node = node_count + node_count += 1 + + elif token == '(': + anchor = prev_node + smile += token + elif token == ')': + prev_node = anchor + smile += token + else: + if token not in '@ . - = # $ : / \\ + - %': + prev_node = node_count + node_count += 1 + smile += token + return smile, bonding_descrpt + +def fragment_iter(fragment_str): + """ + Iterates over fragments defined in a BigSmile string. + Fragments are named residues that consist of a single + smile string together with the BigSmile specific bonding + descriptors. The function returns the resname of a named + fragment as well as a plain nx.Graph of the molecule + described by the smile. Bonding descriptors are annotated + as node attributes with the keyword bonding. 
+ + Parameters + ---------- + fragment_str: str + the string describing the fragments + + Yields + ------ + str, nx.Graph + """ + for fragment in fragment_str[1:-1].split(','): + delim = fragment.find('=', 0) + resname = fragment[1:delim] + big_smile = fragment[delim+1:] + smile, bonding_descrpt = tokenize_big_smile(big_smile) + mol_graph = pysmiles.read_smiles(smile) + atomnames = [str(node[0])+node[1]['element'] for node in mol_graph.nodes(data=True) ] + nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + nx.set_node_attributes(mol_graph, atomnames, 'atomname') + nx.set_node_attributes(mol_graph, resname, 'resname') + yield resname, mol_graph + +def force_field_from_fragments(fragment_str): + """ + Collects the fragments defined in a BigSmile string + as :class:`vermouth.molecule.Blocks` in a force-field + object. Bonding descriptors are annotated as node + attribtues. + + Parameters + ---------- + fragment_str: str + string using BigSmile fragment syntax + + Returns + ------- + :class:`vermouth.forcefield.ForceField` + """ + force_field = ForceField("big_smile_ff") + frag_iter = fragment_iter(fragment_str) + for resname, mol_graph in frag_iter: + mol_block = Block(mol_graph) + force_field.blocks[resname] = mol_block + return forxe_field From 05ed0456919ad1865925f45b07d2e95396053734 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 19 Jan 2024 10:47:06 +0100 Subject: [PATCH 02/25] optional dep. for pysmiles --- polyply/src/big_smile_parsing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index 72e504e6..2ad65a7b 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -1,5 +1,10 @@ import re -import pysmiles +try: + import pysmiles +except ImportError: + msg = ("You are using a functionality that requires " + "the pysmiles package. Use pip install pysmiles ") + raise ImportError(msg) import networkx as nx from vermouth.forcefield import ForceField from vermouth.molecule import Block From 82a2acc2bc0aa6a3e47eb9f51e4d4db06f4f0dbe Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 19 Jan 2024 10:50:13 +0100 Subject: [PATCH 03/25] add a processor that reads a big smile string and returns a full metamolecule including edges. --- polyply/src/big_smile_mol_processsor.py | 99 +++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 polyply/src/big_smile_mol_processsor.py diff --git a/polyply/src/big_smile_mol_processsor.py b/polyply/src/big_smile_mol_processsor.py new file mode 100644 index 00000000..8131e009 --- /dev/null +++ b/polyply/src/big_smile_mol_processsor.py @@ -0,0 +1,99 @@ +import networkx as nx +from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, + force_field_from_fragments) +from polyply.src.map_to_molecule import MapToMolecule + +def compatible(left, right): + """ + Check bonding descriptor compatibility according + to the BigSmiles syntax convetions. + + Parameters + ---------- + left: str + right: str + + Returns + ------- + bool + """ + if left == right: + return True + if left[0] == "<" and right[0] == ">": + if left[1:] == right[1:]: + return True + if left[0] == ">" and right[0] == "<": + if left[1:] == right[1:]: + return True + return False + +def generate_edge(source, target, bond_type="bonding"): + """ + Given a source and a target graph, which have bonding + descriptors stored as node attributes, find a pair of + matching descriptors and return the respective nodes. 
+ The function also returns the bonding descriptors. If + no bonding descriptor is found an instance of LookupError + is raised. + + Parameters + ---------- + source: :class:`nx.Graph` + target: :class:`nx.Graph` + bond_type: `abc.hashable` + under which attribute are the bonding descriptors + stored. + + Returns + ------- + ((abc.hashable, abc.hashable), (str, str)) + the nodes as well as bonding descriptors + + Raises + ------ + LookupError + if no match is found + """ + source_nodes = nx.get_node_attributes(source, bond_type) + target_nodes = nx.get_node_attributes(target, bond_type) + for source_node in source_nodes: + for target_node in target_nodes: + bond_source = source_nodes[source_node] + bond_target = target_nodes[target_node] + if compatible(bond_source, bond_target): + return ((source_node, target_node), (bond_source, bond_target)) + raise LookupError + +class DefBigSmileParser: + """ + Parse an a string instance of a defined BigSmile, + which describes a polymer molecule. + """ + + def __init__(self): + self.force_field = None + self.meta_molecule = None + self.molecule = None + + def edges_from_bonding_descrpt(self): + """ + Make edges according to the bonding descriptors stored + in the node attributes of meta_molecule residue graph. + If a bonding descriptor is consumed it is set to None, + however, the meta_molecule edge gets an attribute with the + bonding descriptors that formed the edge. + """ + for prev_node, node in nx.dfs_edges(self.meta_molecule): + edge, bonding = generate_edge(self.meta_molecule.nodes[prev_node]['graph'], + self.meta_molecule.nodes[node]['graph']) + self.meta_molecule.nodes[prev_node]['graph'][edge[0]]['bonding'] = None + self.meta_molecule.nodes[prev_node]['graph'][edge[1]]['bonding'] = None + self.meta_molecule.molecule.add_edge(edge, bonding=bonding) + + def parse(self, big_smile_str): + res_pattern, residues = big_smile_str.split('.') + self.meta_molecule = res_pattern_to_meta_mol(res_pattern) + self.force_field = force_field_from_fragments(residues) + MapToMolecule(self.force_field).run_molecule(self.meta_molecule) + self.edges_from_bonding_descrpt() + return self.meta_molecule From 257b76b7665355d217dfb0b71249e64255096a35 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sat, 20 Jan 2024 15:43:12 +0100 Subject: [PATCH 04/25] atest-big-smile parsing part I --- polyply/tests/test_big_smile_parsing.py | 64 +++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 polyply/tests/test_big_smile_parsing.py diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py new file mode 100644 index 00000000..43045a83 --- /dev/null +++ b/polyply/tests/test_big_smile_parsing.py @@ -0,0 +1,64 @@ +import pytest +import networkx as nx +from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, + tokenize_big_smile) + +@pytest.mark.parametrize('smile, nodes, edges',( + # smiple linear seqeunce + ("{[#PMA][#PEO][#PMA]}", + ["PMA", "PEO", "PMA"], + [(0, 1), (1, 2)]), + # simple branched sequence + ("{[#PMA][#PMA]([#PEO][#PEO])[#PMA]}", + ["PMA", "PMA", "PEO", "PEO", "PMA"], + [(0, 1), (1, 2), (2, 3), (1, 4)]), + # simple sequence two branches + ("{[#PMA][#PMA][#PMA]([#PEO][#PEO])([#CH3])[#PMA]}", + ["PMA", "PMA", "PMA", "PEO", "PEO", "CH3", "PMA"], + [(0, 1), (1, 2), (2, 3), (3, 4), (2, 5), (2, 6)]), + # simple linear sequence with expansion + ("{[#PMA]|3}", + ["PMA", "PMA", "PMA"], + [(0, 1), (1, 2)]), + ## simple branched with expansion + #("{[#PMA]([#PEO]|3)|2}", + #["PMA", "PEO", "PEO", 
"PEO", + # "PMA", "PEO", "PEO", "PEO"], + #[(0, 1), (1, 2), (2, 3), + # (0, 4), (4, 5), (5, 6), (6, 7)] + # ) +)) +def test_res_pattern_to_meta_mol(smile, nodes, edges): + """ + Test that the meta-molecule is correctly reproduced + from the simplified smile string syntax. + """ + meta_mol = res_pattern_to_meta_mol(smile) + assert len(meta_mol.edges) == len(edges) + for edge in edges: + assert meta_mol.has_edge(*edge) + resnames = nx.get_node_attributes(meta_mol, 'resname') + assert nodes == list(resnames.values()) + +@pytest.mark.parametrize('big_smile, smile, bonding',( + # smiple symmetric bonding + ("[$]COC[$]", + "COC", + {0: '$', 2: '$'}), + # named different bonding descriptors + ("[$1]CCCC[$2]", + "CCCC", + {0: "$1", 3: "$2"}), + # bonding descript. after branch + ("C(COC[$1])[$2]CCC[$3]", + "C(COC)CCC", + {0: '$2', 3: '$1', 6: '$3'}), + # left rigth bonding desciptors + ("[>]COC[<]", + "COC", + {0: '>', 2: '<'}) +)) +def test_tokenize_big_smile(big_smile, smile, bonding): + new_smile, new_bonding = tokenize_big_smile(big_smile) + assert new_smile == smile + assert new_bonding == bonding From 061c8efc60931b2fff5ef4c866b0c3a952ebded9 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 22 Jan 2024 16:37:32 +0100 Subject: [PATCH 05/25] fix hcount for single atom; fix nexted branches --- polyply/src/big_smile_parsing.py | 54 +++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index 2ad65a7b..ddb9bd2a 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -1,4 +1,5 @@ import re +import numpy as np try: import pysmiles except ImportError: @@ -16,6 +17,12 @@ "fragment": r'#(\w+)=((?:\[.*?\]|[^,\[\]]+)*)', "seq_pattern": r'\{([^}]*)\}(?:\.\{([^}]*)\})?'} +def _find_next_character(string, chars, start): + for idx, token in enumerate(string[start:]): + if token in chars: + return idx+start + return np.inf + def res_pattern_to_meta_mol(pattern): """ Generate a :class:`polyply.MetaMolecule` from a @@ -67,13 +74,15 @@ def res_pattern_to_meta_mol(pattern): branching = False for match in re.finditer(PATTERNS['place_holder'], pattern): start, stop = match.span() + print(pattern[start:stop]) # new branch here if pattern[start-1] == '(': branching = True branch_anchor = prev_node recipie = [(meta_mol.nodes[prev_node]['resname'], 1)] if stop < len(pattern) and pattern[stop] == '|': - n_mon = int(pattern[stop+1:pattern.find('[', stop)]) + eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop) + n_mon = int(pattern[stop+1:eon]) else: n_mon = 1 @@ -94,12 +103,17 @@ def res_pattern_to_meta_mol(pattern): current += 1 # terminate branch and jump back to anchor - if stop < len(pattern) and pattern[stop] == ')' and branching: + branch_stop = _find_next_character(pattern, ['['], stop) >\ + _find_next_character(pattern, [')'], stop) + if stop <= len(pattern) and branch_stop and branching: branching = False prev_node = branch_anchor # we have to multiply the branch n-times - if stop+1 < len(pattern) and pattern[stop+1] == "|": - for _ in range(0,int(pattern[stop+2:pattern.find('[', stop)])): + eon_a = _find_next_character(pattern, [')'], stop) + if stop+1 < len(pattern) and pattern[eon_a+1] == "|": + eon_b = _find_next_character(pattern, ['[', ')', '(', '}'], eon_a+1) + # -1 because one branch has already been added at this point + for _ in range(0,int(pattern[eon_a+2:eon_b])-1): for bdx, (resname, n_mon) in enumerate(recipie): if bdx == 0: anchor = 
current @@ -166,12 +180,36 @@ def tokenize_big_smile(big_smile): prev_node = anchor smile += token else: - if token not in '@ . - = # $ : / \\ + - %': + if token not in '@ . - = # $ : / \\ + - %'\ + and not token.isdigit(): prev_node = node_count node_count += 1 smile += token return smile, bonding_descrpt +def _rebuild_h_atoms(mol_graph): + # special hack around to fix + # pysmiles bug for a single + # atom molecule; we assume that the + # hcount is just wrong and set it to + # the valance number minus bonds minus + # bonding connectors + if len(mol_graph.nodes) == 1: + ele = mol_graph.nodes[0]['element'] + # for N and P we assume the regular valency + hcount = pysmiles.smiles_helper.VALENCES[ele][0] + if mol_graph.nodes[0].get('bonding', False): + hcount -= 1 + mol_graph.nodes[0]['hcount'] = hcount + else: + for node in mol_graph.nodes: + if mol_graph.nodes[node].get('bonding', False): + hcount = mol_graph.nodes[node]['hcount'] + mol_graph.nodes[node]['hcount'] = hcount - 1 + + pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) + return mol_graph + def fragment_iter(fragment_str): """ Iterates over fragments defined in a BigSmile string. @@ -197,8 +235,10 @@ def fragment_iter(fragment_str): big_smile = fragment[delim+1:] smile, bonding_descrpt = tokenize_big_smile(big_smile) mol_graph = pysmiles.read_smiles(smile) - atomnames = [str(node[0])+node[1]['element'] for node in mol_graph.nodes(data=True) ] nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + # we need to rebuild hydrogen atoms now + _rebuild_h_atoms(mol_graph) + atomnames = {node[0]: node[1]['element']+str(node[0]) for node in mol_graph.nodes(data=True)} nx.set_node_attributes(mol_graph, atomnames, 'atomname') nx.set_node_attributes(mol_graph, resname, 'resname') yield resname, mol_graph @@ -224,4 +264,4 @@ def force_field_from_fragments(fragment_str): for resname, mol_graph in frag_iter: mol_block = Block(mol_graph) force_field.blocks[resname] = mol_block - return forxe_field + return force_field From 20e2e4917500ef8621a696f4395494a3dfd5a6e8 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 22 Jan 2024 16:37:58 +0100 Subject: [PATCH 06/25] tests for smile iter and test nested branches --- polyply/tests/test_big_smile_parsing.py | 71 ++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py index 43045a83..3265564c 100644 --- a/polyply/tests/test_big_smile_parsing.py +++ b/polyply/tests/test_big_smile_parsing.py @@ -1,7 +1,8 @@ import pytest import networkx as nx from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, - tokenize_big_smile) + tokenize_big_smile, + fragment_iter) @pytest.mark.parametrize('smile, nodes, edges',( # smiple linear seqeunce @@ -20,13 +21,20 @@ ("{[#PMA]|3}", ["PMA", "PMA", "PMA"], [(0, 1), (1, 2)]), - ## simple branched with expansion - #("{[#PMA]([#PEO]|3)|2}", - #["PMA", "PEO", "PEO", "PEO", - # "PMA", "PEO", "PEO", "PEO"], - #[(0, 1), (1, 2), (2, 3), - # (0, 4), (4, 5), (5, 6), (6, 7)] - # ) + # simple branch expension + ("{[#PMA]([#PEO][#PEO][#OHter])|2}", + ["PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter"], + [(0, 1), (1, 2), (2, 3), + (0, 4), (4, 5), (5, 6), (6, 7)] + ), + # nested branched with expansion + ("{[#PMA]([#PEO]|3)|2}", + ["PMA", "PEO", "PEO", "PEO", + "PMA", "PEO", "PEO", "PEO"], + [(0, 1), (1, 2), (2, 3), + (0, 4), (4, 5), (5, 6), (6, 7)] + ) )) def test_res_pattern_to_meta_mol(smile, nodes, edges): """ @@ -49,6 +57,10 
@@ def test_res_pattern_to_meta_mol(smile, nodes, edges): ("[$1]CCCC[$2]", "CCCC", {0: "$1", 3: "$2"}), + # ring and bonding descriptors + ("[$1]CC[$2]C1CCCCC1", + "CCC1CCCCC1", + {0: "$1", 1: "$2"}), # bonding descript. after branch ("C(COC[$1])[$2]CCC[$3]", "C(COC)CCC", @@ -62,3 +74,46 @@ def test_tokenize_big_smile(big_smile, smile, bonding): new_smile, new_bonding = tokenize_big_smile(big_smile) assert new_smile == smile assert new_bonding == bonding + +@pytest.mark.parametrize('fragment_str, nodes, edges',( + # single fragment + ("{#PEO=[$]COC[$]}", + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": "$", "element": "C"}), + (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": "$", "element": "C"}), + (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), + (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), + (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), + (6, {"atomname": "H6", "resname": "PEO", "element": "H"}), + )}, + {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), + # test NH3 terminal + ("{#AMM=N[$]}", + {"AMM": ((0, {"atomname": "N0", "resname": "AMM", "bonding": "$", "element": "N"}), + (1, {"atomname": "H1", "resname": "AMM", "element": "H"}), + (2, {"atomname": "H2", "resname": "AMM", "element": "H"}), + )}, + {"AMM": [(0, 1), (0, 2)]}), + # single fragment + 1 terminal (i.e. only 1 bonding descrpt + ("{#PEO=[$]COC[$],#OHter=[$][OH]}", + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": "$", "element": "C"}), + (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": "$", "element": "C"}), + (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), + (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), + (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), + (6, {"atomname": "H6", "resname": "PEO", "element": "H"}), + ), + "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": "$", "element": "O"}), + (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))}, + {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)], + "OHter": [(0, 1)]}), +)) +def test_fragment_iter(fragment_str, nodes, edges): + for resname, mol_graph in fragment_iter(fragment_str): + assert len(mol_graph.nodes) == len(nodes[resname]) + for node, ref_node in zip(mol_graph.nodes(data=True), nodes[resname]): + assert node[0] == ref_node[0] + for key in ref_node[1]: + assert ref_node[1][key] == node[1][key] + assert sorted(mol_graph.edges) == sorted(edges[resname]) From f505129f98cdff951f9137a832cb604f44bce8f9 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 22 Jan 2024 16:40:17 +0100 Subject: [PATCH 07/25] add pysmiles to test requrm. 
--- requirements-tests.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-tests.txt b/requirements-tests.txt index 595a4902..03357910 100644 --- a/requirements-tests.txt +++ b/requirements-tests.txt @@ -4,3 +4,4 @@ pytest-cov pylint codecov tqdm +pysmiles From 0c67ecc17c530c50aa98781fe7df0bb37324e983 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 22 Jan 2024 18:49:28 +0100 Subject: [PATCH 08/25] add tests for bonding descriptor evaluation --- polyply/tests/test_big_smile_mol_proc.py | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 polyply/tests/test_big_smile_mol_proc.py diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py new file mode 100644 index 00000000..7bcdf9f9 --- /dev/null +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -0,0 +1,37 @@ +import pytest +import networkx as nx +from polyply.src.big_smile_mol_processor import (DefBigSmileParser, + generate_edge) + +@pytest.mark.parametrize('bonds_source, bonds_target, edge, btypes',( + # single bond source each + ({0: "$"}, + {3: "$"}, + (0, 3), + ('$', '$')), + # multiple sources one match + ({0: '$1', 2: '$2'}, + {1: '$2', 3: '$'}, + (2, 1), + ('$2', '$2')), + # left right selective bonding + ({0: '$', 1: '>', 3: '<'}, + {0: '>', 1: '$5'}, + (3, 0), + ('<', '>')), + # left right selective bonding + # with identifier + ({0: '$', 1: '>', 3: '<1'}, + {0: '>', 1: '$5', 2: '>1'}, + (3, 2), + ('<1', '>1')), + +)) +def test_generate_edge(bonds_source, bonds_target, edge, btypes): + source = nx.path_graph(5) + target = nx.path_graph(4) + nx.set_node_attributes(source, bonds_source, "bonding") + nx.set_node_attributes(target, bonds_target, "bonding") + new_edge, new_btypes = generate_edge(source, target, bond_type="bonding") + assert new_edge == edge + assert new_btypes == btypes From 52235c91887a87b1be7bea3d50902f9470e286e2 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 23 Jan 2024 18:57:46 +0100 Subject: [PATCH 09/25] add tests for big smile molecule prc --- polyply/tests/test_big_smile_mol_proc.py | 66 ++++++++++++++++++++---- 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py index 7bcdf9f9..58667ed8 100644 --- a/polyply/tests/test_big_smile_mol_proc.py +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -2,27 +2,32 @@ import networkx as nx from polyply.src.big_smile_mol_processor import (DefBigSmileParser, generate_edge) - +import matplotlib.pyplot as plt @pytest.mark.parametrize('bonds_source, bonds_target, edge, btypes',( # single bond source each - ({0: "$"}, - {3: "$"}, + ({0: ["$"]}, + {3: ["$"]}, + (0, 3), + ('$', '$')), + # include a None + ({0: ["$"], 1: []}, + {3: ["$"]}, (0, 3), ('$', '$')), # multiple sources one match - ({0: '$1', 2: '$2'}, - {1: '$2', 3: '$'}, + ({0: ['$1'], 2: ['$2']}, + {1: ['$2'], 3: ['$']}, (2, 1), ('$2', '$2')), # left right selective bonding - ({0: '$', 1: '>', 3: '<'}, - {0: '>', 1: '$5'}, + ({0: ['$'], 1: ['>'], 3: ['<']}, + {0: ['>'], 1: ['$5']}, (3, 0), ('<', '>')), # left right selective bonding # with identifier - ({0: '$', 1: '>', 3: '<1'}, - {0: '>', 1: '$5', 2: '>1'}, + ({0: ['$'], 1: ['>'], 3: ['<1']}, + {0: ['>'], 1: ['$5'], 2: ['>1']}, (3, 2), ('<1', '>1')), @@ -35,3 +40,46 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): new_edge, new_btypes = generate_edge(source, target, bond_type="bonding") assert new_edge == edge assert new_btypes == btypes + + 
+@pytest.mark.parametrize('smile, ref_nodes, ref_edges',( + # smiple linear seqeunce + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}", + # 0 1 2 3 4 5 6 7 8 + [('OHter', 'O H'), ('PEO', 'C O C H H H H'), + # 9 10 11 12 13 14 15 16 17 + ('PEO', 'C O C H H H H'), ('OHter', 'O H')], + [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), + (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), + (11, 14), (11, 15), (11, 16), (16, 17)]), + # simple branched sequence + ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", + [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), + ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), ('Hter', 'H')], + [(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8), + (8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27), + (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)]), + # something with a ring + # 012 34567 + # 890123456 + ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$]H}", + [('Hter', 'H'), ('PS', 'C C C C C C C C H H H H H H H H'), + ('PS', 'C C C C C C C C H H H H H H H H'), ('Hter', 'H')], + [(0, 1), (1, 2), (1, 9), (1, 10), (2, 3), (2, 11), (2, 17), + (3, 4), (3, 8), (4, 5), (4, 12), (5, 6), (5, 13), (6, 7), + (6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25), + (17, 26), (18, 19), (18, 27), (18, 33), (19, 20), (19, 24), + (20, 21), (20, 28), (21, 22), (21, 29), (22, 23), (22, 30), + (23, 24), (23, 31), (24, 32)]), + +)) +def test_def_big_smile_parser(smile, ref_nodes, ref_edges): + meta_mol = DefBigSmileParser().parse(smile) + for node, ref in zip(meta_mol.nodes, ref_nodes): + assert meta_mol.nodes[node]['resname'] == ref[0] + block_graph = meta_mol.nodes[node]['graph'] + elements = list(nx.get_node_attributes(block_graph, 'element').values()) + assert elements == ref[1].split() + #nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) + #plt.show() + assert sorted(meta_mol.molecule.edges) == sorted(ref_edges) From 9a0a674fa685af88df1567d9051a0bbb308e80d4 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Tue, 23 Jan 2024 18:58:18 +0100 Subject: [PATCH 10/25] allow multiple bonding per atom; fix bugs --- polyply/src/big_smile_mol_processor.py | 117 +++++++++++++++++++++++++ polyply/src/big_smile_parsing.py | 22 +++-- 2 files changed, 132 insertions(+), 7 deletions(-) create mode 100644 polyply/src/big_smile_mol_processor.py diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py new file mode 100644 index 00000000..8499e7e3 --- /dev/null +++ b/polyply/src/big_smile_mol_processor.py @@ -0,0 +1,117 @@ +import networkx as nx +from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, + force_field_from_fragments) +from polyply.src.map_to_molecule import MapToMolecule + +def compatible(left, right): + """ + Check bonding descriptor compatibility according + to the BigSmiles syntax convetions. 
+ + Parameters + ---------- + left: str + right: str + + Returns + ------- + bool + """ + if left == right and left not in '> <': + return True + if left[0] == "<" and right[0] == ">": + if left[1:] == right[1:]: + return True + if left[0] == ">" and right[0] == "<": + if left[1:] == right[1:]: + return True + return False + +def generate_edge(source, target, bond_type="bonding"): + """ + Given a source and a target graph, which have bonding + descriptors stored as node attributes, find a pair of + matching descriptors and return the respective nodes. + The function also returns the bonding descriptors. If + no bonding descriptor is found an instance of LookupError + is raised. + + Parameters + ---------- + source: :class:`nx.Graph` + target: :class:`nx.Graph` + bond_type: `abc.hashable` + under which attribute are the bonding descriptors + stored. + + Returns + ------- + ((abc.hashable, abc.hashable), (str, str)) + the nodes as well as bonding descriptors + + Raises + ------ + LookupError + if no match is found + """ + source_nodes = nx.get_node_attributes(source, bond_type) + target_nodes = nx.get_node_attributes(target, bond_type) + for source_node in source_nodes: + for target_node in target_nodes: + #print(source_node, target_node) + bond_sources = source_nodes[source_node] + bond_targets = target_nodes[target_node] + for bond_source in bond_sources: + for bond_target in bond_targets: + #print(bond_source, bond_target) + if compatible(bond_source, bond_target): + return ((source_node, target_node), (bond_source, bond_target)) + raise LookupError + +class DefBigSmileParser: + """ + Parse an a string instance of a defined BigSmile, + which describes a polymer molecule. + """ + + def __init__(self): + self.force_field = None + self.meta_molecule = None + self.molecule = None + + def edges_from_bonding_descrpt(self): + """ + Make edges according to the bonding descriptors stored + in the node attributes of meta_molecule residue graph. + If a bonding descriptor is consumed it is set to None, + however, the meta_molecule edge gets an attribute with the + bonding descriptors that formed the edge. 
+ """ + for prev_node, node in nx.dfs_edges(self.meta_molecule): + prev_graph = self.meta_molecule.nodes[prev_node]['graph'] + node_graph = self.meta_molecule.nodes[node]['graph'] + edge, bonding = generate_edge(prev_graph, + node_graph) + # this is a bit of a workaround because at this stage the + # bonding list is actually shared between all residues of + # of the same type; so we first make a copy then we replace + # the list sans used bonding descriptor + prev_bond_list = prev_graph.nodes[edge[0]]['bonding'].copy() + prev_bond_list.remove(bonding[0]) + prev_graph.nodes[edge[0]]['bonding'] = prev_bond_list + node_bond_list = node_graph.nodes[edge[1]]['bonding'].copy() + node_bond_list.remove(bonding[1]) + node_graph.nodes[edge[1]]['bonding'] = node_bond_list + self.meta_molecule.molecule.add_edge(edge[0], edge[1], bonding=bonding) + + def parse(self, big_smile_str): + res_pattern, residues = big_smile_str.split('.') + self.meta_molecule = res_pattern_to_meta_mol(res_pattern) + self.force_field = force_field_from_fragments(residues) + MapToMolecule(self.force_field).run_molecule(self.meta_molecule) + self.edges_from_bonding_descrpt() + return self.meta_molecule + +# ToDo +# - replace non consumed bonding descrpt by hydrogen +# - diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index ddb9bd2a..fa6348cc 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -1,3 +1,4 @@ +from collections import defaultdict import re import numpy as np try: @@ -154,7 +155,7 @@ def tokenize_big_smile(big_smile): to the nodes within the smile """ smile_iter = _big_smile_iter(big_smile) - bonding_descrpt = {} + bonding_descrpt = defaultdict(list) smile = "" node_count = 0 prev_node = 0 @@ -167,7 +168,7 @@ def tokenize_big_smile(big_smile): while peek != ']': bond_descrp += peek peek = next(smile_iter) - bonding_descrpt[prev_node] = bond_descrp + bonding_descrpt[prev_node].append(bond_descrp) else: smile = smile + token + peek prev_node = node_count @@ -205,7 +206,7 @@ def _rebuild_h_atoms(mol_graph): for node in mol_graph.nodes: if mol_graph.nodes[node].get('bonding', False): hcount = mol_graph.nodes[node]['hcount'] - mol_graph.nodes[node]['hcount'] = hcount - 1 + mol_graph.nodes[node]['hcount'] = hcount - len(mol_graph.nodes[node]['bonding']) pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) return mol_graph @@ -234,10 +235,17 @@ def fragment_iter(fragment_str): resname = fragment[1:delim] big_smile = fragment[delim+1:] smile, bonding_descrpt = tokenize_big_smile(big_smile) - mol_graph = pysmiles.read_smiles(smile) - nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') - # we need to rebuild hydrogen atoms now - _rebuild_h_atoms(mol_graph) + + if smile == "H": + mol_graph = nx.Graph() + mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0]) + nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + else: + mol_graph = pysmiles.read_smiles(smile) + nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding') + # we need to rebuild hydrogen atoms now + _rebuild_h_atoms(mol_graph) + atomnames = {node[0]: node[1]['element']+str(node[0]) for node in mol_graph.nodes(data=True)} nx.set_node_attributes(mol_graph, atomnames, 'atomname') nx.set_node_attributes(mol_graph, resname, 'resname') From ceccc3d53fab73921c87bfd29a885fda7e284726 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 24 Jan 2024 10:59:53 +0100 Subject: [PATCH 11/25] remove mpl import --- polyply/tests/test_big_smile_mol_proc.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py index 58667ed8..6975b885 100644 --- a/polyply/tests/test_big_smile_mol_proc.py +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -2,7 +2,7 @@ import networkx as nx from polyply.src.big_smile_mol_processor import (DefBigSmileParser, generate_edge) -import matplotlib.pyplot as plt +#import matplotlib.pyplot as plt @pytest.mark.parametrize('bonds_source, bonds_target, edge, btypes',( # single bond source each ({0: ["$"]}, From 158fd3734f321d2084d7b466297dfe4c0c851d30 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 24 Jan 2024 11:12:13 +0100 Subject: [PATCH 12/25] add changed tests for multiple bonding per atom --- polyply/tests/test_big_smile_parsing.py | 40 ++++++++++++++++++------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py index 3265564c..f7faf71a 100644 --- a/polyply/tests/test_big_smile_parsing.py +++ b/polyply/tests/test_big_smile_parsing.py @@ -52,23 +52,27 @@ def test_res_pattern_to_meta_mol(smile, nodes, edges): # smiple symmetric bonding ("[$]COC[$]", "COC", - {0: '$', 2: '$'}), + {0: ["$"], 2: ["$"]}), + # smiple symmetric bonding; multiple descript + ("[$]COC[$][$1]", + "COC", + {0: ["$"], 2: ["$", "$1"]}), # named different bonding descriptors ("[$1]CCCC[$2]", "CCCC", - {0: "$1", 3: "$2"}), + {0: ["$1"], 3: ["$2"]}), # ring and bonding descriptors ("[$1]CC[$2]C1CCCCC1", "CCC1CCCCC1", - {0: "$1", 1: "$2"}), + {0: ["$1"], 1: ["$2"]}), # bonding descript. after branch ("C(COC[$1])[$2]CCC[$3]", "C(COC)CCC", - {0: '$2', 3: '$1', 6: '$3'}), + {0: ["$2"], 3: ["$1"], 6: ["$3"]}), # left rigth bonding desciptors ("[>]COC[<]", "COC", - {0: '>', 2: '<'}) + {0: [">"], 2: ["<"]}) )) def test_tokenize_big_smile(big_smile, smile, bonding): new_smile, new_bonding = tokenize_big_smile(big_smile) @@ -78,9 +82,9 @@ def test_tokenize_big_smile(big_smile, smile, bonding): @pytest.mark.parametrize('fragment_str, nodes, edges',( # single fragment ("{#PEO=[$]COC[$]}", - {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": "$", "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}), (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "resname": "PEO", "bonding": "$", "element": "C"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$"], "element": "C"}), (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), @@ -89,25 +93,39 @@ def test_tokenize_big_smile(big_smile, smile, bonding): {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), # test NH3 terminal ("{#AMM=N[$]}", - {"AMM": ((0, {"atomname": "N0", "resname": "AMM", "bonding": "$", "element": "N"}), + {"AMM": ((0, {"atomname": "N0", "resname": "AMM", "bonding": ["$"], "element": "N"}), (1, {"atomname": "H1", "resname": "AMM", "element": "H"}), (2, {"atomname": "H2", "resname": "AMM", "element": "H"}), )}, {"AMM": [(0, 1), (0, 2)]}), # single fragment + 1 terminal (i.e. 
only 1 bonding descrpt ("{#PEO=[$]COC[$],#OHter=[$][OH]}", - {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": "$", "element": "C"}), + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}), (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), - (2, {"atomname": "C2", "resname": "PEO", "bonding": "$", "element": "C"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$"], "element": "C"}), (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), (6, {"atomname": "H6", "resname": "PEO", "element": "H"}), ), - "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": "$", "element": "O"}), + "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": ["$"], "element": "O"}), (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))}, {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)], "OHter": [(0, 1)]}), + # single fragment + 1 terminal but multiple bond descritp. + # this adjust the hydrogen count + ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}", + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}), + (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$", "$1"], "element": "C"}), + (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), + (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), + (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), + ), + "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": ["$"], "element": "O"}), + (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))}, + {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)], + "OHter": [(0, 1)]}), )) def test_fragment_iter(fragment_str, nodes, edges): for resname, mol_graph in fragment_iter(fragment_str): From 8f2887f5d2149e94330014a8b32f47d64caf1b3d Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 24 Jan 2024 12:13:41 +0100 Subject: [PATCH 13/25] delete old processor file --- polyply/src/big_smile_mol_processsor.py | 99 ------------------------- 1 file changed, 99 deletions(-) delete mode 100644 polyply/src/big_smile_mol_processsor.py diff --git a/polyply/src/big_smile_mol_processsor.py b/polyply/src/big_smile_mol_processsor.py deleted file mode 100644 index 8131e009..00000000 --- a/polyply/src/big_smile_mol_processsor.py +++ /dev/null @@ -1,99 +0,0 @@ -import networkx as nx -from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, - force_field_from_fragments) -from polyply.src.map_to_molecule import MapToMolecule - -def compatible(left, right): - """ - Check bonding descriptor compatibility according - to the BigSmiles syntax convetions. - - Parameters - ---------- - left: str - right: str - - Returns - ------- - bool - """ - if left == right: - return True - if left[0] == "<" and right[0] == ">": - if left[1:] == right[1:]: - return True - if left[0] == ">" and right[0] == "<": - if left[1:] == right[1:]: - return True - return False - -def generate_edge(source, target, bond_type="bonding"): - """ - Given a source and a target graph, which have bonding - descriptors stored as node attributes, find a pair of - matching descriptors and return the respective nodes. - The function also returns the bonding descriptors. If - no bonding descriptor is found an instance of LookupError - is raised. 
- - Parameters - ---------- - source: :class:`nx.Graph` - target: :class:`nx.Graph` - bond_type: `abc.hashable` - under which attribute are the bonding descriptors - stored. - - Returns - ------- - ((abc.hashable, abc.hashable), (str, str)) - the nodes as well as bonding descriptors - - Raises - ------ - LookupError - if no match is found - """ - source_nodes = nx.get_node_attributes(source, bond_type) - target_nodes = nx.get_node_attributes(target, bond_type) - for source_node in source_nodes: - for target_node in target_nodes: - bond_source = source_nodes[source_node] - bond_target = target_nodes[target_node] - if compatible(bond_source, bond_target): - return ((source_node, target_node), (bond_source, bond_target)) - raise LookupError - -class DefBigSmileParser: - """ - Parse an a string instance of a defined BigSmile, - which describes a polymer molecule. - """ - - def __init__(self): - self.force_field = None - self.meta_molecule = None - self.molecule = None - - def edges_from_bonding_descrpt(self): - """ - Make edges according to the bonding descriptors stored - in the node attributes of meta_molecule residue graph. - If a bonding descriptor is consumed it is set to None, - however, the meta_molecule edge gets an attribute with the - bonding descriptors that formed the edge. - """ - for prev_node, node in nx.dfs_edges(self.meta_molecule): - edge, bonding = generate_edge(self.meta_molecule.nodes[prev_node]['graph'], - self.meta_molecule.nodes[node]['graph']) - self.meta_molecule.nodes[prev_node]['graph'][edge[0]]['bonding'] = None - self.meta_molecule.nodes[prev_node]['graph'][edge[1]]['bonding'] = None - self.meta_molecule.molecule.add_edge(edge, bonding=bonding) - - def parse(self, big_smile_str): - res_pattern, residues = big_smile_str.split('.') - self.meta_molecule = res_pattern_to_meta_mol(res_pattern) - self.force_field = force_field_from_fragments(residues) - MapToMolecule(self.force_field).run_molecule(self.meta_molecule) - self.edges_from_bonding_descrpt() - return self.meta_molecule From 7cb3b4cf32fc8eb52b42b4b8a795fafb3cf7faa6 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 3 Mar 2024 14:30:49 +0100 Subject: [PATCH 14/25] add H and ] as special characters in big smile parser --- polyply/src/big_smile_parsing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index fa6348cc..6969a31c 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -75,7 +75,6 @@ def res_pattern_to_meta_mol(pattern): branching = False for match in re.finditer(PATTERNS['place_holder'], pattern): start, stop = match.span() - print(pattern[start:stop]) # new branch here if pattern[start-1] == '(': branching = True @@ -181,7 +180,7 @@ def tokenize_big_smile(big_smile): prev_node = anchor smile += token else: - if token not in '@ . - = # $ : / \\ + - %'\ + if token not in '] H @ . 
- = # $ : / \\ + - %'\ and not token.isdigit(): prev_node = node_count node_count += 1 From 097ec842efefb5d4e5ca8b32d38365174f9af10a Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 3 Mar 2024 14:54:11 +0100 Subject: [PATCH 15/25] account for explicit hydrogen in the smiles string input --- polyply/src/big_smile_parsing.py | 11 +++++++-- polyply/tests/test_big_smile_parsing.py | 30 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index 6969a31c..55d3a6aa 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -204,8 +204,15 @@ def _rebuild_h_atoms(mol_graph): else: for node in mol_graph.nodes: if mol_graph.nodes[node].get('bonding', False): - hcount = mol_graph.nodes[node]['hcount'] - mol_graph.nodes[node]['hcount'] = hcount - len(mol_graph.nodes[node]['bonding']) + # get the degree + ele = mol_graph.nodes[0]['element'] + # hcoung is the valance minus the degree minus + # the number of bonding descriptors + hcount = pysmiles.smiles_helper.VALENCES[ele][0] -\ + mol_graph.degree(node) -\ + len(mol_graph.nodes[node]['bonding']) + + mol_graph.nodes[node]['hcount'] = hcount pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph) return mol_graph diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py index f7faf71a..ba3f5f69 100644 --- a/polyply/tests/test_big_smile_parsing.py +++ b/polyply/tests/test_big_smile_parsing.py @@ -53,6 +53,10 @@ def test_res_pattern_to_meta_mol(smile, nodes, edges): ("[$]COC[$]", "COC", {0: ["$"], 2: ["$"]}), + # simple symmetric but with explicit hydrogen + ("[$][CH2]O[CH2][$]", + "[CH2]O[CH2]", + {0: ["$"], 2: ["$"]}), # smiple symmetric bonding; multiple descript ("[$]COC[$][$1]", "COC", @@ -91,6 +95,17 @@ def test_tokenize_big_smile(big_smile, smile, bonding): (6, {"atomname": "H6", "resname": "PEO", "element": "H"}), )}, {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), + # single fragment but with explicit hydrogen in smiles + ("{#PEO=[$][CH2]O[CH2][$]}", + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}), + (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$"], "element": "C"}), + (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), + (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), + (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), + (6, {"atomname": "H6", "resname": "PEO", "element": "H"}), + )}, + {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5), (2, 6)]}), # test NH3 terminal ("{#AMM=N[$]}", {"AMM": ((0, {"atomname": "N0", "resname": "AMM", "bonding": ["$"], "element": "N"}), @@ -126,6 +141,21 @@ def test_tokenize_big_smile(big_smile, smile, bonding): (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))}, {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)], "OHter": [(0, 1)]}), + # single fragment + 1 terminal but multiple bond descritp. 
+ # but explicit hydrogen in the smiles string + ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}", + {"PEO": ((0, {"atomname": "C0", "resname": "PEO", "bonding": ["$"], "element": "C"}), + (1, {"atomname": "O1", "resname": "PEO", "element": "O"}), + (2, {"atomname": "C2", "resname": "PEO", "bonding": ["$", "$1"], "element": "C"}), + (3, {"atomname": "H3", "resname": "PEO", "element": "H"}), + (4, {"atomname": "H4", "resname": "PEO", "element": "H"}), + (5, {"atomname": "H5", "resname": "PEO", "element": "H"}), + ), + "OHter": ((0, {"atomname": "O0", "resname": "OHter", "bonding": ["$"], "element": "O"}), + (1, {"atomname": "H1", "resname": "OHter", "element": "H"}))}, + {"PEO": [(0, 1), (1, 2), (0, 3), (0, 4), (2, 5)], + "OHter": [(0, 1)]}), + )) def test_fragment_iter(fragment_str, nodes, edges): for resname, mol_graph in fragment_iter(fragment_str): From 514ba1b2408da1bf592845719b05baeb4dc61d12 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Sun, 3 Mar 2024 15:57:12 +0100 Subject: [PATCH 16/25] test accounting for explicit hydrogen in the smiles string input --- polyply/src/big_smile_mol_processor.py | 25 ++++++++++++++++++++---- polyply/tests/test_big_smile_mol_proc.py | 13 ++++++++++-- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py index 8499e7e3..f474fe76 100644 --- a/polyply/src/big_smile_mol_processor.py +++ b/polyply/src/big_smile_mol_processor.py @@ -104,14 +104,31 @@ def edges_from_bonding_descrpt(self): node_graph.nodes[edge[1]]['bonding'] = node_bond_list self.meta_molecule.molecule.add_edge(edge[0], edge[1], bonding=bonding) + def replace_unconsumed_bonding_descrpt(self): + """ + We allow multiple bonding descriptors per atom, which + however, are not always consumed. In this case the left + over bonding descriptors are replaced by hydrogen atoms. + """ + for node in self.meta_molecule.nodes: + graph = self.meta_molecule.nodes[node]['graph'] + bonding = nx.get_node_attributes(graph, "bonding") + for node, bondings in bonding.items(): + attrs = {attr: graph.nodes[node][attr] for attr in ['resname', 'resid']} + attrs['element'] = 'H' + for new_id in range(1, len(bondings)+1): + new_node = len(self.meta_molecule.molecule.nodes) + 1 + graph.add_edge(node, new_node) + attrs['atomname'] = "H" + str(new_id + len(graph.nodes)) + graph.nodes[new_node].update(attrs) + self.meta_molecule.molecule.add_edge(node, new_node) + self.meta_molecule.molecule.nodes[new_node].update(attrs) + def parse(self, big_smile_str): res_pattern, residues = big_smile_str.split('.') self.meta_molecule = res_pattern_to_meta_mol(res_pattern) self.force_field = force_field_from_fragments(residues) MapToMolecule(self.force_field).run_molecule(self.meta_molecule) self.edges_from_bonding_descrpt() + self.replace_unconsumed_bonding_descrpt() return self.meta_molecule - -# ToDo -# - replace non consumed bonding descrpt by hydrogen -# - diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py index 6975b885..26e85ba6 100644 --- a/polyply/tests/test_big_smile_mol_proc.py +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -52,6 +52,15 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7), (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13), (11, 14), (11, 15), (11, 16), (16, 17)]), + # uncomsumed bonding IDs; note that this is not the same + # molecule as previous test case. 
Here one of the OH branches + # and replaces an CH2 group with CH-OH + ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[>][$1]COC[<],#OHter=[$1][O]}", + [('OHter', 'O H'), ('PEO', 'C O C H H H H'), + ('PEO', 'C O C H H H H'), ('OHter', 'O H')], + [(0, 1), (0, 2), (2, 3), (2, 5), (2, 10), (3, 4), + (4, 6), (4, 7), (4, 17), (8, 9), (8, 11), (8, 14), + (8, 18), (9, 10), (10, 12), (10, 13), (14, 15)]), # simple branched sequence ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}", [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'), @@ -75,11 +84,11 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): )) def test_def_big_smile_parser(smile, ref_nodes, ref_edges): meta_mol = DefBigSmileParser().parse(smile) +# nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) +# plt.show() for node, ref in zip(meta_mol.nodes, ref_nodes): assert meta_mol.nodes[node]['resname'] == ref[0] block_graph = meta_mol.nodes[node]['graph'] elements = list(nx.get_node_attributes(block_graph, 'element').values()) assert elements == ref[1].split() - #nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) - #plt.show() assert sorted(meta_mol.molecule.edges) == sorted(ref_edges) From d97632d57a427479f1a77fe380acc41df97ef3d1 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Mon, 4 Mar 2024 15:35:24 +0100 Subject: [PATCH 17/25] adjust doc string --- polyply/src/big_smile_mol_processor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py index f474fe76..b533e818 100644 --- a/polyply/src/big_smile_mol_processor.py +++ b/polyply/src/big_smile_mol_processor.py @@ -83,9 +83,10 @@ def edges_from_bonding_descrpt(self): """ Make edges according to the bonding descriptors stored in the node attributes of meta_molecule residue graph. - If a bonding descriptor is consumed it is set to None, + If a bonding descriptor is consumed it is removed from the list, however, the meta_molecule edge gets an attribute with the - bonding descriptors that formed the edge. + bonding descriptors that formed the edge. Later uncomsumed + bonding descriptors are replaced by hydrogen atoms. """ for prev_node, node in nx.dfs_edges(self.meta_molecule): prev_graph = self.meta_molecule.nodes[prev_node]['graph'] From c4f16527532172f887689b86bc3659284ce33f97 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 6 Mar 2024 18:00:36 +0100 Subject: [PATCH 18/25] parse force-field in molprocessor, adjust hydrogen reconstruction --- polyply/src/big_smile_mol_processor.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py index b533e818..640c40e1 100644 --- a/polyply/src/big_smile_mol_processor.py +++ b/polyply/src/big_smile_mol_processor.py @@ -1,8 +1,12 @@ import networkx as nx +import pysmiles from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol, force_field_from_fragments) from polyply.src.map_to_molecule import MapToMolecule +VALENCES = pysmiles.smiles_helper.VALENCES +VALENCES.update({"H":(1,)}) + def compatible(left, right): """ Check bonding descriptor compatibility according @@ -74,8 +78,8 @@ class DefBigSmileParser: which describes a polymer molecule. 
""" - def __init__(self): - self.force_field = None + def __init__(self, force_field): + self.force_field = force_field self.meta_molecule = None self.molecule = None @@ -115,9 +119,12 @@ def replace_unconsumed_bonding_descrpt(self): graph = self.meta_molecule.nodes[node]['graph'] bonding = nx.get_node_attributes(graph, "bonding") for node, bondings in bonding.items(): + element = graph.nodes[node]['element'] + hcount = VALENCES[element][0] -\ + self.meta_molecule.molecule.degree(node) + 1 attrs = {attr: graph.nodes[node][attr] for attr in ['resname', 'resid']} attrs['element'] = 'H' - for new_id in range(1, len(bondings)+1): + for new_id in range(1, hcount): new_node = len(self.meta_molecule.molecule.nodes) + 1 graph.add_edge(node, new_node) attrs['atomname'] = "H" + str(new_id + len(graph.nodes)) From 7a5dd1f74e76e8c15103a5ad93e9490d253403bf Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Wed, 6 Mar 2024 19:12:26 +0100 Subject: [PATCH 19/25] fix tests --- polyply/tests/test_big_smile_mol_proc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py index 26e85ba6..28c5390d 100644 --- a/polyply/tests/test_big_smile_mol_proc.py +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -1,5 +1,6 @@ import pytest import networkx as nx +from vermouth.forcefield import ForceField from polyply.src.big_smile_mol_processor import (DefBigSmileParser, generate_edge) #import matplotlib.pyplot as plt @@ -83,7 +84,8 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): )) def test_def_big_smile_parser(smile, ref_nodes, ref_edges): - meta_mol = DefBigSmileParser().parse(smile) + ff = ForceField("new") + meta_mol = DefBigSmileParser(ff).parse(smile) # nx.draw_networkx(meta_mol.molecule, with_labels=True, labels=nx.get_node_attributes(meta_mol.molecule, 'element')) # plt.show() for node, ref in zip(meta_mol.nodes, ref_nodes): From 2b9e7a9cdc414a3961efb802e667db74a7572a56 Mon Sep 17 00:00:00 2001 From: "Dr. Fabian Grunewald" <32294573+fgrunewald@users.noreply.github.com> Date: Wed, 6 Mar 2024 19:16:53 +0100 Subject: [PATCH 20/25] Apply suggestions from code review Co-authored-by: Peter C Kroon --- polyply/src/big_smile_mol_processor.py | 9 +++------ polyply/src/big_smile_parsing.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py index 640c40e1..365b61bc 100644 --- a/polyply/src/big_smile_mol_processor.py +++ b/polyply/src/big_smile_mol_processor.py @@ -23,12 +23,9 @@ def compatible(left, right): """ if left == right and left not in '> <': return True - if left[0] == "<" and right[0] == ">": - if left[1:] == right[1:]: - return True - if left[0] == ">" and right[0] == "<": - if left[1:] == right[1:]: - return True + l, r = left[0], right[0] + if (l, r) == ('<', '>') or (l, r) == ('>', '<'): + return left[1:] == right[1:] return False def generate_edge(source, target, bond_type="bonding"): diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index 55d3a6aa..90e171a3 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -3,10 +3,10 @@ import numpy as np try: import pysmiles -except ImportError: +except ImportError as error: msg = ("You are using a functionality that requires " "the pysmiles package. 
Use pip install pysmiles ") - raise ImportError(msg) + raise ImportError(msg) from error import networkx as nx from vermouth.forcefield import ForceField from vermouth.molecule import Block @@ -41,7 +41,7 @@ def res_pattern_to_meta_mol(pattern): '{' + [#resname_1][#resname_2]... + '}' In addition to plain enumeration any residue may be - followed by a '|' and an integern number that + followed by a '|' and an integer number that specifies how many times the given residue should be added within a sequence. For example, a pentamer of PEO can be written as: @@ -52,10 +52,10 @@ def res_pattern_to_meta_mol(pattern): {[#PEO]|5} - The block syntax also applies to branches. Here the convetion + The block syntax also applies to branches. Here the convention is that the complete branch including it's first anchoring residue is repeated. For example, to generate a PMA-g-PEG - polymer the following syntax is permitted: + polymer containing 15 residues the following syntax is permitted: {[#PMA]([#PEO][#PEO])|5} @@ -79,7 +79,7 @@ def res_pattern_to_meta_mol(pattern): if pattern[start-1] == '(': branching = True branch_anchor = prev_node - recipie = [(meta_mol.nodes[prev_node]['resname'], 1)] + recipe = [(meta_mol.nodes[prev_node]['resname'], 1)] if stop < len(pattern) and pattern[stop] == '|': eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop) n_mon = int(pattern[stop+1:eon]) @@ -89,7 +89,7 @@ def res_pattern_to_meta_mol(pattern): resname = match.group(0)[2:-1] # collect all residues in branch if branching: - recipie.append((resname, n_mon)) + recipe.append((resname, n_mon)) # add the new residue connection = [] @@ -135,7 +135,7 @@ def tokenize_big_smile(big_smile): """ Processes a BigSmile string by storing the the BigSmile specific bonding descriptors - in a dict with refernce to the atom they + in a dict with reference to the atom they refer to. Furthermore, a cleaned smile string is generated with the BigSmile specific syntax removed. 
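For reference, a minimal usage sketch of the bonding descriptor compatibility rule refactored in PATCH 20/25 above. This sketch is not part of any patch in the series; it only assumes the module layout used here (compatible living in polyply/src/big_smile_mol_processor.py):

    from polyply.src.big_smile_mol_processor import compatible

    # '$'-type descriptors bond to an identical descriptor, while '<' and '>'
    # descriptors bond to their complementary partner carrying the same label.
    assert compatible("$1", "$1")       # identical '$' descriptors match
    assert compatible("<1", ">1")       # complementary pair, same label
    assert compatible(">2", "<2")       # order of the pair does not matter
    assert not compatible("$1", "$2")   # different '$' labels do not match
    assert not compatible("<1", ">2")   # complementary pair, mismatched labels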
From b6d891f6f32bbc60ed96fd30d75953f717d21117 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 7 Mar 2024 14:24:47 +0100 Subject: [PATCH 21/25] allow nested branch expansion --- polyply/src/big_smile_parsing.py | 81 ++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index 90e171a3..e396a5e3 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -24,6 +24,22 @@ def _find_next_character(string, chars, start): return idx+start return np.inf +def _expand_branch(meta_mol, current, anchor, recipe): + prev_node = anchor + for bdx, (resname, n_mon) in enumerate(recipe): + if bdx == 0: + anchor = current + for _ in range(0, n_mon): + connection = [(prev_node, current)] + print(connection) + meta_mol.add_monomer(current, + resname, + connection) + prev_node = current + current += 1 + prev_node = anchor + return meta_mol, current, prev_node + def res_pattern_to_meta_mol(pattern): """ Generate a :class:`polyply.MetaMolecule` from a @@ -70,17 +86,30 @@ def res_pattern_to_meta_mol(pattern): """ meta_mol = MetaMolecule() current = 0 - branch_anchor = 0 + # stores one or more branch anchors; each next + # anchor belongs to a nested branch + branch_anchor = [] + # used for storing composition protocol for + # for branches; each entry is a list of + # branches from extending from the anchor + # point + recipes = defaultdict(list) + # the previous node prev_node = None + # do we have an open branch branching = False for match in re.finditer(PATTERNS['place_holder'], pattern): start, stop = match.span() # new branch here if pattern[start-1] == '(': branching = True - branch_anchor = prev_node - recipe = [(meta_mol.nodes[prev_node]['resname'], 1)] + branch_anchor.append(prev_node) + # the recipe for making the branch includes the anchor; which + # is hence the first atom in the list + if len(branch_anchor) == 1: + recipes[branch_anchor[-1]] = [(meta_mol.nodes[prev_node]['resname'], 1)] if stop < len(pattern) and pattern[stop] == '|': + # eon => end of next eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop) n_mon = int(pattern[stop+1:eon]) else: @@ -89,7 +118,7 @@ def res_pattern_to_meta_mol(pattern): resname = match.group(0)[2:-1] # collect all residues in branch if branching: - recipe.append((resname, n_mon)) + recipes[branch_anchor[-1]].append((resname, n_mon)) # add the new residue connection = [] @@ -105,26 +134,40 @@ def res_pattern_to_meta_mol(pattern): # terminate branch and jump back to anchor branch_stop = _find_next_character(pattern, ['['], stop) >\ _find_next_character(pattern, [')'], stop) - if stop <= len(pattern) and branch_stop and branching: + + if stop <= len(pattern) and branch_stop: # and branching: branching = False - prev_node = branch_anchor + prev_node = branch_anchor.pop() + if branch_anchor: + branching = True # we have to multiply the branch n-times eon_a = _find_next_character(pattern, [')'], stop) if stop+1 < len(pattern) and pattern[eon_a+1] == "|": eon_b = _find_next_character(pattern, ['[', ')', '(', '}'], eon_a+1) - # -1 because one branch has already been added at this point - for _ in range(0,int(pattern[eon_a+2:eon_b])-1): - for bdx, (resname, n_mon) in enumerate(recipie): - if bdx == 0: - anchor = current - for _ in range(0, n_mon): - connection = [(prev_node, current)] - meta_mol.add_monomer(current, - resname, - connection) - prev_node = current - current += 1 - prev_node = anchor + # the outermost loop 
goes over how often a the branch has to be + # added to the existing sequence + for idx in range(0,int(pattern[eon_a+2:eon_b])-1): + prev_anchor = None + skip = 0 + for ref_anchor, recipe in list(recipes.items())[len(branch_anchor):]: + print("-->", recipe) + if prev_anchor: + offset = ref_anchor - prev_anchor + prev_node = prev_node + offset + #skip = 1 + print(prev_node) + meta_mol, current, prev_node = _expand_branch(meta_mol, + current=current, + anchor=prev_node, + recipe=recipe) #[skip:]) + if prev_anchor is None: + base_anchor = prev_node + prev_anchor = ref_anchor + print(base_anchor) + prev_node = base_anchor + # if all branches are done we need to reset the lists + # branch_anchor = [] + # recipes = defaultdict(list) return meta_mol def _big_smile_iter(smile): From a867329a82d2f4988e43839b84357032841534a4 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 7 Mar 2024 14:56:16 +0100 Subject: [PATCH 22/25] test branch expansion --- polyply/src/big_smile_parsing.py | 17 +++--- polyply/tests/test_big_smile_parsing.py | 69 +++++++++++++++++++++++-- 2 files changed, 73 insertions(+), 13 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index e396a5e3..d93265ec 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -31,7 +31,6 @@ def _expand_branch(meta_mol, current, anchor, recipe): anchor = current for _ in range(0, n_mon): connection = [(prev_node, current)] - print(connection) meta_mol.add_monomer(current, resname, connection) @@ -106,8 +105,8 @@ def res_pattern_to_meta_mol(pattern): branch_anchor.append(prev_node) # the recipe for making the branch includes the anchor; which # is hence the first atom in the list - if len(branch_anchor) == 1: - recipes[branch_anchor[-1]] = [(meta_mol.nodes[prev_node]['resname'], 1)] + #if len(branch_anchor) == 1: + recipes[branch_anchor[-1]] = [(meta_mol.nodes[prev_node]['resname'], 1)] if stop < len(pattern) and pattern[stop] == '|': # eon => end of next eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop) @@ -150,24 +149,22 @@ def res_pattern_to_meta_mol(pattern): prev_anchor = None skip = 0 for ref_anchor, recipe in list(recipes.items())[len(branch_anchor):]: - print("-->", recipe) if prev_anchor: offset = ref_anchor - prev_anchor prev_node = prev_node + offset - #skip = 1 - print(prev_node) + skip = 1 meta_mol, current, prev_node = _expand_branch(meta_mol, current=current, anchor=prev_node, - recipe=recipe) #[skip:]) + recipe=recipe[skip:]) if prev_anchor is None: base_anchor = prev_node prev_anchor = ref_anchor - print(base_anchor) prev_node = base_anchor # if all branches are done we need to reset the lists - # branch_anchor = [] - # recipes = defaultdict(list) + # when all nested branches are completed + if len(branch_anchor) == 0: + recipes = defaultdict(list) return meta_mol def _big_smile_iter(smile): diff --git a/polyply/tests/test_big_smile_parsing.py b/polyply/tests/test_big_smile_parsing.py index ba3f5f69..5c1491b8 100644 --- a/polyply/tests/test_big_smile_parsing.py +++ b/polyply/tests/test_big_smile_parsing.py @@ -22,11 +22,13 @@ ["PMA", "PMA", "PMA"], [(0, 1), (1, 2)]), # simple branch expension - ("{[#PMA]([#PEO][#PEO][#OHter])|2}", + ("{[#PMA]([#PEO][#PEO][#OHter])|3}", ["PMA", "PEO", "PEO", "OHter", + "PMA", "PEO", "PEO", "OHter", "PMA", "PEO", "PEO", "OHter"], [(0, 1), (1, 2), (2, 3), - (0, 4), (4, 5), (5, 6), (6, 7)] + (0, 4), (4, 5), (5, 6), (6, 7), + (4, 8), (8, 9), (9, 10), (10, 11)] ), # nested branched with expansion 
("{[#PMA]([#PEO]|3)|2}", @@ -34,7 +36,68 @@ "PMA", "PEO", "PEO", "PEO"], [(0, 1), (1, 2), (2, 3), (0, 4), (4, 5), (5, 6), (6, 7)] - ) + ), + # nested braching + # 0 1 2 3 4 5 6 + ("{[#PMA][#PMA]([#PEO][#PEO]([#OH])[#PEO])[#PMA]}", + ["PMA", "PMA", "PEO", "PEO", "OH", + "PEO", "PMA"], + [(0, 1), (1, 2), (2, 3), + (3, 4), (3, 5), (1, 6)] + ), + # nested braching plus expansion + # 0 1 2 3 4/5 6 7 + ("{[#PMA][#PMA]([#PEO][#PEO]([#OH]|2)[#PEO])[#PMA]}", + ["PMA", "PMA", "PEO", "PEO", "OH", "OH", + "PEO", "PMA"], + [(0, 1), (1, 2), (2, 3), + (3, 4), (4, 5), (3, 6), (1, 7)] + ), + # nested braching plus expansion incl. branch + # 0 1 2 3 4 5 + # 6 7 8 9 10 11 + ("{[#PMA][#PMA]([#PEO][#PEO]([#OH])[#PEO])|2[#PMA]}", + ["PMA", "PMA", "PEO", "PEO", "OH", "PEO", + "PMA", "PEO", "PEO", "PEO", "OH", "PMA"], + [(0, 1), (1, 2), (2, 3), + (3, 4), (3, 5), (1, 6), (6, 7), (7, 8), + (8, 9), (8, 10), (6, 11)] + ), + # nested braching plus expansion of nested branch + # here the nested branch is expended + # 0 - 1 - 10 + # | + # 2 + # | + # 3 {- 5 - 7 } - 9 -> the expanded fragment + # | | | + # 4 6 8 + ("{[#PMA][#PMA]([#PEO][#PQ]([#OH])|3[#PEO])[#PMA]}", + ["PMA", "PMA", "PEO", "PQ", "OH", + "PQ", "OH", "PQ", "OH", "PEO", "PMA"], + [(0, 1), (1, 2), (1, 10), + (2, 3), (3, 4), (3, 5), (5, 6), + (5, 7), (7, 8), (7, 9)] + ), + # nested braching plus expansion of nested branch + # here the nested branch is expended and a complete + # new branch is added + # 11 13 + # | | + # 0 - 1 - 10 - 12 + # | + # 2 + # | + # 3 {- 5 - 7 } - 9 -> the expanded fragment + # | | | + # 4 6 8 + ("{[#PMA][#PMA]([#PEO][#PQ]([#OH])|3[#PEO])[#PMA]([#CH3])|2}", + ["PMA", "PMA", "PEO", "PQ", "OH", + "PQ", "OH", "PQ", "OH", "PEO", "PMA", "CH3", "PMA", "CH3"], + [(0, 1), (1, 2), (1, 10), + (2, 3), (3, 4), (3, 5), (5, 6), + (5, 7), (7, 8), (7, 9), (10, 11), (10, 12), (12, 13)] + ), )) def test_res_pattern_to_meta_mol(smile, nodes, edges): """ From b6f5cc0d4a101948ca853cf1226afb598f6b96f3 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 7 Mar 2024 15:19:09 +0100 Subject: [PATCH 23/25] add comments all over residue expansion functions --- polyply/src/big_smile_parsing.py | 62 +++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index d93265ec..b043ebd9 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -97,29 +97,42 @@ def res_pattern_to_meta_mol(pattern): prev_node = None # do we have an open branch branching = False + # each element in the for loop matches a pattern + # '[' + '#' + some alphanumeric name + ']' for match in re.finditer(PATTERNS['place_holder'], pattern): start, stop = match.span() - # new branch here + # we start a new branch when the residue is preceded by '(' + # as in ... ([#PEO] ... if pattern[start-1] == '(': branching = True branch_anchor.append(prev_node) # the recipe for making the branch includes the anchor; which - # is hence the first atom in the list - #if len(branch_anchor) == 1: + # is hence the first residue in the list recipes[branch_anchor[-1]] = [(meta_mol.nodes[prev_node]['resname'], 1)] + # here we check if the atom is followed by a expansion character '|' + # as in ... 
[#PEO]| if stop < len(pattern) and pattern[stop] == '|': # eon => end of next + # we find the next character that starts a new residue, ends a branch or + # ends the complete pattern eon = _find_next_character(pattern, ['[', ')', '(', '}'], stop) + # between the expansion character and the eon character + # is any number that correspnds to the number of times (i.e. monomers) + # that this atom should be added n_mon = int(pattern[stop+1:eon]) else: n_mon = 1 + # the resname starts at the second character and ends + # one before the last according to the above pattern resname = match.group(0)[2:-1] - # collect all residues in branch + # if this residue is part of a branch we store it in + # the recipe dict together with the anchor residue + # and expansion number if branching: recipes[branch_anchor[-1]].append((resname, n_mon)) - # add the new residue + # new we add new residue as often as required connection = [] for _ in range(0, n_mon): if prev_node is not None: @@ -130,36 +143,69 @@ def res_pattern_to_meta_mol(pattern): prev_node = current current += 1 - # terminate branch and jump back to anchor + # here we check if the residue considered before is the + # last residue of a branch (i.e. '...[#residue])' + # that is the case if the branch closure comes before + # any new atom begins branch_stop = _find_next_character(pattern, ['['], stop) >\ _find_next_character(pattern, [')'], stop) - if stop <= len(pattern) and branch_stop: # and branching: + # if the branch ends we reset the anchor + # and set branching False unless we are in + # a nested branch + if stop <= len(pattern) and branch_stop: branching = False prev_node = branch_anchor.pop() if branch_anchor: branching = True - # we have to multiply the branch n-times + #======================================== + # expansion for branches + #======================================== + # We need to know how often the branch has + # to be added so we first identify the branch + # terminal character ')' called eon_a. eon_a = _find_next_character(pattern, [')'], stop) + # Then we check if the expansion character + # is next. if stop+1 < len(pattern) and pattern[eon_a+1] == "|": + # If there is one we find the beginning + # of the next branch, residue or end of the string + # As before all characters inbetween are a number that + # is how often the branch is expanded. eon_b = _find_next_character(pattern, ['[', ')', '(', '}'], eon_a+1) # the outermost loop goes over how often a the branch has to be # added to the existing sequence for idx in range(0,int(pattern[eon_a+2:eon_b])-1): prev_anchor = None skip = 0 + # in principle each branch can contain any number of nested branches + # each branch is itself a recipe that has an anchor atom for ref_anchor, recipe in list(recipes.items())[len(branch_anchor):]: + # starting from the first nested branch we have to do some + # math to find the anchor atom relative to the first branch + # we also skip the first residue in recipe, which is the + # anchor residue. Only the outermost branch in an expansion + # is expanded including the anchor. This allows easy description + # of graft polymers. 
                        if prev_anchor:
                            offset = ref_anchor - prev_anchor
                            prev_node = prev_node + offset
                            skip = 1
+                        # this function simply adds the residues of the particular
+                        # branch
                         meta_mol, current, prev_node = _expand_branch(meta_mol,
                                                                       current=current,
                                                                       anchor=prev_node,
                                                                       recipe=recipe[skip:])
+                        # if this is the first branch we want to set the anchor
+                        # as the base anchor to which we jump back after all nested
+                        # branches have been added
                         if prev_anchor is None:
                             base_anchor = prev_node
+                        # store the previous anchor so we can do the math for nested
+                        # branches
                         prev_anchor = ref_anchor
+                # all branches added; then go back to the base anchor
                 prev_node = base_anchor
                 # if all branches are done we need to reset the lists
                 # when all nested branches are completed
                 if len(branch_anchor) == 0:
                     recipes = defaultdict(list)
     return meta_mol

From f965e1d42c6b3afb41b690407c36aeee0a4e8493 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald
Date: Thu, 7 Mar 2024 15:33:45 +0100
Subject: [PATCH 24/25] address comments

---
 polyply/src/big_smile_mol_processor.py   |  8 ++++----
 polyply/src/big_smile_parsing.py         | 12 ++++--------
 polyply/tests/test_big_smile_mol_proc.py |  2 +-
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py
index 365b61bc..e706217a 100644
--- a/polyply/src/big_smile_mol_processor.py
+++ b/polyply/src/big_smile_mol_processor.py
@@ -28,7 +28,7 @@ def compatible(left, right):
         return left[1:] == right[1:]
     return False

-def generate_edge(source, target, bond_type="bonding"):
+def generate_edge(source, target, bond_attribute="bonding"):
     """
     Given a source and a target graph, which have bonding
     descriptors stored as node attributes, find a pair of
@@ -41,7 +41,7 @@ def generate_edge(source, target, bond_type="bonding"):
     ----------
     source: :class:`nx.Graph`
     target: :class:`nx.Graph`
-    bond_type: `abc.hashable`
+    bond_attribute: `abc.hashable`
         under which attribute are the bonding
         descriptors stored.
@@ -55,8 +55,8 @@ def generate_edge(source, target, bond_type="bonding"): LookupError if no match is found """ - source_nodes = nx.get_node_attributes(source, bond_type) - target_nodes = nx.get_node_attributes(target, bond_type) + source_nodes = nx.get_node_attributes(source, bond_attribute) + target_nodes = nx.get_node_attributes(target, bond_attribute) for source_node in source_nodes: for target_node in target_nodes: #print(source_node, target_node) diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py index b043ebd9..d591eecd 100644 --- a/polyply/src/big_smile_parsing.py +++ b/polyply/src/big_smile_parsing.py @@ -213,10 +213,6 @@ def res_pattern_to_meta_mol(pattern): recipes = defaultdict(list) return meta_mol -def _big_smile_iter(smile): - for token in smile: - yield token - def tokenize_big_smile(big_smile): """ Processes a BigSmile string by storing the @@ -229,17 +225,17 @@ def tokenize_big_smile(big_smile): Parameters ---------- smile: str - a BigSmile smile string + a BigSmile smiles string Returns ------- str - a canonical smile string + a canonical smiles string dict a dict mapping bonding descriptors - to the nodes within the smile + to the nodes within the smiles string """ - smile_iter = _big_smile_iter(big_smile) + smile_iter = iter(big_smile) bonding_descrpt = defaultdict(list) smile = "" node_count = 0 diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py index 28c5390d..c40f96bd 100644 --- a/polyply/tests/test_big_smile_mol_proc.py +++ b/polyply/tests/test_big_smile_mol_proc.py @@ -38,7 +38,7 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes): target = nx.path_graph(4) nx.set_node_attributes(source, bonds_source, "bonding") nx.set_node_attributes(target, bonds_target, "bonding") - new_edge, new_btypes = generate_edge(source, target, bond_type="bonding") + new_edge, new_btypes = generate_edge(source, target, bond_attribute="bonding") assert new_edge == edge assert new_btypes == btypes From 0335956072b84fcc8d59f2d9b6264917ce971879 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Thu, 7 Mar 2024 16:00:29 +0100 Subject: [PATCH 25/25] allow for ionic bonds with . 
syntax

---
 polyply/src/big_smile_mol_processor.py   |  6 +++++-
 polyply/src/big_smile_parsing.py         |  4 ++++
 polyply/tests/test_big_smile_mol_proc.py | 10 ++++++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/polyply/src/big_smile_mol_processor.py b/polyply/src/big_smile_mol_processor.py
index e706217a..1801a437 100644
--- a/polyply/src/big_smile_mol_processor.py
+++ b/polyply/src/big_smile_mol_processor.py
@@ -1,3 +1,4 @@
+import re
 import networkx as nx
 import pysmiles
 from polyply.src.big_smile_parsing import (res_pattern_to_meta_mol,
@@ -130,10 +131,13 @@ def replace_unconsumed_bonding_descrpt(self):
                     self.meta_molecule.molecule.nodes[new_node].update(attrs)

     def parse(self, big_smile_str):
-        res_pattern, residues = big_smile_str.split('.')
+        res_pattern, residues = re.findall(r"\{[^\}]+\}", big_smile_str)
         self.meta_molecule = res_pattern_to_meta_mol(res_pattern)
         self.force_field = force_field_from_fragments(residues)
         MapToMolecule(self.force_field).run_molecule(self.meta_molecule)
         self.edges_from_bonding_descrpt()
         self.replace_unconsumed_bonding_descrpt()
         return self.meta_molecule
+
+# ToDo
+# - clean copying of bond-list attributes L100
diff --git a/polyply/src/big_smile_parsing.py b/polyply/src/big_smile_parsing.py
index d591eecd..ec136bea 100644
--- a/polyply/src/big_smile_parsing.py
+++ b/polyply/src/big_smile_parsing.py
@@ -361,3 +361,7 @@ def force_field_from_fragments(fragment_str):
         mol_block = Block(mol_graph)
         force_field.blocks[resname] = mol_block
     return force_field
+
+# ToDos
+# - remove special case hydrogen line 327ff
+# - check rebuild_h and clean up
diff --git a/polyply/tests/test_big_smile_mol_proc.py b/polyply/tests/test_big_smile_mol_proc.py
index c40f96bd..b6fe8e03 100644
--- a/polyply/tests/test_big_smile_mol_proc.py
+++ b/polyply/tests/test_big_smile_mol_proc.py
@@ -53,6 +53,16 @@ def test_generate_edge(bonds_source, bonds_target, edge, btypes):
      [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
       (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
       (11, 14), (11, 15), (11, 16), (16, 17)]),
+    # simple linear sequence with ionic bond
+    ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O].[Na+]}",
+    # 0 1 2 3 4 5 6 7 8
+    [('OHter', 'O Na'), ('PEO', 'C O C H H H H'),
+    # 9 10 11 12 13 14 15 16 17
+    ('PEO', 'C O C H H H H'), ('OHter', 'O Na')],
+    [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
+     (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
+     (11, 14), (11, 15), (11, 16), (16, 17)]),
+
     # uncomsumed bonding IDs; note that this is not the same
     # molecule as previous test case. Here one of the OH branches
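For reference, a minimal end-to-end sketch of the ionic-bond ('.') syntax introduced in PATCH 25/25, built directly from the new test case above. The sketch is not part of any patch; it assumes the DefBigSmileParser API as it stands at this point in the series (a ForceField instance passed to the constructor, as in the tests):

    from vermouth.forcefield import ForceField
    from polyply.src.big_smile_mol_processor import DefBigSmileParser

    # PEO dimer with O/Na termini; the '.' inside the #OHter fragment keeps the
    # sodium as an ionically bound atom rather than adding a covalent bond.
    big_smile = "{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O].[Na+]}"
    meta_mol = DefBigSmileParser(ForceField("new")).parse(big_smile)
    # per the test case above this yields the residues
    # ['OHter', 'PEO', 'PEO', 'OHter']
    print([meta_mol.nodes[node]['resname'] for node in meta_mol.nodes])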