From d627ac22f171f398bb630fd6a0b462a76b74ea41 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Mon, 14 Oct 2024 18:37:10 +0200
Subject: [PATCH 01/16] implement weight annotation

---
 cgsmiles/pysmiles_utils.py              |  5 ++
 cgsmiles/read_fragments.py              | 42 +++++++++++++--
 cgsmiles/tests/test_molecule_resolve.py | 68 +++++++++++++++++--------
 3 files changed, 91 insertions(+), 24 deletions(-)

diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
index 83051bd..419efb3 100644
--- a/cgsmiles/pysmiles_utils.py
+++ b/cgsmiles/pysmiles_utils.py
@@ -83,6 +83,11 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
             ref_node = next(mol_graph.neighbors(node))
             mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
             mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]
+        if mol_graph.nodes[node].get("element", "*") == "H":
+            # make sure the weights are copied for implicit h-atoms
+            anchor = list(mol_graph.neighbors(node))[0]
+            weight = mol_graph.nodes[anchor].get("weight", 1)
+            mol_graph.nodes[node]["weight"] = weight
 
 def annotate_ez_isomers(molecule):
     """
diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
index f065317..fa28829 100644
--- a/cgsmiles/read_fragments.py
+++ b/cgsmiles/read_fragments.py
@@ -95,6 +95,31 @@ def collect_ring_number(smile_iter, token, node_count, rings):
 
     return smile_iter, token, partial_str, rings
 
+def get_weight(smile_iter):
+    """
+    Parse the weight annotation of an atom/node in a fragment.
+    The iter must be advanced past the weight marker ';',
+    so that the next character is the first of the number.
+
+    Parameters
+    ----------
+    smile_iter: class.PeekIter
+
+    Returns
+    -------
+    float
+        the weight; float() raises ValueError if malformed
+    PeekIter
+        the same iter, stopped before ']', '@', or 'H'
+    """
+    num = []
+    for digit in smile_iter:
+        num.append(digit)
+        if smile_iter.peek() in [']', '@', 'H']:  # number ends here
+            break
+    out = float("".join(num))
+    return out, smile_iter
+
 def strip_bonding_descriptors(fragment_string):
     """
     Processes a CGSmiles fragment string by
@@ -122,6 +147,7 @@ def strip_bonding_descriptors(fragment_string):
     rings = defaultdict(list)
     ez_isomer_atoms = {}
     rs_isomers = {}
+    weights = {}
     smile = ""
     node_count = 0
     prev_node = 0
@@ -147,6 +173,8 @@ def strip_bonding_descriptors(fragment_string):
                 bonding_descrpt[prev_node].append(bond_descrp + str(order))
             else:
                 atom = token
+                # set the default weight
+                weights[node_count] = 1
                 while peek != ']':
                     # deal with rs chirality
                     if peek == '@':
@@ -154,6 +182,10 @@ def strip_bonding_descriptors(fragment_string):
                         if smile_iter.peek() == '@':
                             chiral_token = '@' + next(smile_iter)
                         rs_isomers[node_count] = (chiral_token, [])
+                    # we have weights
+                    elif peek == ';':
+                        weight, smile_iter = get_weight(smile_iter)
+                        weights[node_count] = weight
                     else:
                         atom += peek
                     peek = next(smile_iter)
@@ -193,6 +225,8 @@ def strip_bonding_descriptors(fragment_string):
                 smile += token
             current_order = None
             prev_node = node_count
+            # set default weight
+            weights[node_count] = 1
             node_count += 1
 
     # we need to annotate rings to the chiral isomers
@@ -201,7 +235,8 @@ def strip_bonding_descriptors(fragment_string):
             if node in ring_nodes:
                 bonded_node = _find_bonded_ring_node(ring_nodes, node)
                 rs_isomers[node][1].append(bonded_node)
-    return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms
+
+    return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, weights
 
 def fragment_iter(fragment_str, all_atom=True):
     """
@@ -230,8 +265,8 @@ def fragment_iter(fragment_str, all_atom=True):
     for fragment in fragment_str[1:-1].split(','):
         delim = fragment.find('=', 0)
         fragname = fragment[1:delim]
-        big_smile = fragment[delim+1:]
-        smile, bonding_descrpt, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile)
+        frag_smile = fragment[delim+1:]
+        smile, bonding_descrpt, rs_isomers, ez_isomers, weights = strip_bonding_descriptors(frag_smile)
         if smile == "H":
             mol_graph = nx.Graph()
             mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0])
@@ -258,6 +293,7 @@ def fragment_iter(fragment_str, all_atom=True):
 
         nx.set_node_attributes(mol_graph, fragname, 'fragname')
         nx.set_node_attributes(mol_graph, 0, 'fragid')
+        nx.set_node_attributes(mol_graph, weights, 'weight')
         yield fragname, mol_graph
 
 def read_fragments(fragment_str, all_atom=True, fragment_dict=None):
diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py
index afdbf89..14b1d70 100644
--- a/cgsmiles/tests/test_molecule_resolve.py
+++ b/cgsmiles/tests/test_molecule_resolve.py
@@ -40,7 +40,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
     assert new_btypes == btypes
 
 
-@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez',(
+@pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez, weights',(
                         # smiple linear seqeunce
                         ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}",
                         #           0 1             2 3 4 5 6 7 8
@@ -51,7 +51,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
                          (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
                          (11, 14), (11, 15), (11, 16), (16, 17)],
-                        {}, {}),
+                        {}, {}, {}),
                         # smiple linear seqeunce with bond-order in link
                         ("{[#TC1][#TC4][#TC1]}.{#TC1=[$1]=CC=[$2],#TC4=[$1]=CC=[$2]}",
                         #         0 1 2 3 4 5            6 7 8 9
@@ -61,7 +61,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         'C C H H H H C C H H C C H H H H',
                         [(0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (0, 6), (6, 7),
                          (6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13),
-                         (10, 14), (11, 15)], {}, {}),
+                         (10, 14), (11, 15)], {}, {}, {}),
                         # smiple linear seqeunce unconsumed bonding descrpt
                         ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}",
                         #           0 1             2 3 4 5 6 7 8
@@ -71,7 +71,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         'O H C O C H H H H C O C H H H H O H',
                         [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
                          (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
-                         (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}),
+                         (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}, {}),
                         # smiple linear seqeunce with ionic bond
                         ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O-].[Na+]}",
                         #           0 1             2 3 4 5 6 7 8
@@ -81,7 +81,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         'O Na C O C H H H H C O C H H H H O Na',
                         [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
                          (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
-                         (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}),
+                         (11, 14), (11, 15), (11, 16), (16, 17)], {}, {}, {}),
                         # smiple linear seqeunce with ionic ending
                         ("{[#OH][#PEO]|2[#ON]}.{#PEO=[$]COC[$],#OH=[$]O,#ON=[$][O-]}",
                         #           0 1             2 3 4 5 6 7 8
@@ -91,7 +91,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         'O H C O C H H H H C O C H H H H O',
                         [(0, 1), (0, 2), (2, 3), (3, 4), (2, 5), (2, 6), (4, 7),
                          (4, 8), (4, 9), (9, 10), (10, 11), (9, 12), (9, 13),
-                         (11, 14), (11, 15), (11, 16)], {}, {}),
+                         (11, 14), (11, 15), (11, 16)], {}, {}, {}),
                         # uncomsumed bonding IDs; note that this is not the same
                         # molecule as previous test case. Here one of the OH branches
                         # and replaces an CH2 group with CH-OH
@@ -103,7 +103,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         'O H C O C H H H H C O C H H H H O H',
                         [(0, 1), (0, 2), (2, 3), (2, 5), (2, 11), (3, 4),
                          (4, 6), (4, 7), (4, 8), (9, 10), (9, 12), (9, 13),
-                         (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}),
+                         (10, 11), (11, 15), (11, 14), (9, 16), (16, 17)], {}, {}, {}),
                         # simple branched sequence
                         ("{[#Hter][#PE]([#PEO][#Hter])[#PE]([#PEO][#Hter])[#Hter]}.{#Hter=[$]H,#PE=[$]CC[$][$],#PEO=[$]COC[$]}",
                         [('Hter', 'H'), ('PE', 'C C H H H'), ('PEO', 'C O C H H H H'), ('Hter', 'H'),
@@ -111,7 +111,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         'H C C H H H C O C H H H H H C C H H H C O C H H H H H H',
                         [(0, 1), (1, 2), (1, 3), (1, 4), (2, 5), (2, 6), (2, 14), (6, 7), (6, 9), (6, 10), (7, 8),
                          (8, 11), (8, 12), (8, 13), (14, 15), (14, 16), (14, 17), (15, 18), (15, 19), (15, 27),
-                         (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}),
+                         (19, 20), (19, 22), (19, 23), (20, 21), (21, 24), (21, 25), (21, 26)], {}, {}, {}),
                         # something with a ring
                         #            012 34567
                         #            890123456
@@ -124,7 +124,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                          (6, 14), (7, 8), (7, 15), (8, 16), (17, 18), (17, 25),
                          (17, 26), (18, 19), (18, 27), (18, 33), (19, 20), (19, 24),
                          (20, 21), (20, 28), (21, 22), (21, 29), (22, 23), (22, 30),
-                         (23, 24), (23, 31), (24, 32)], {}, {}),
+                         (23, 24), (23, 31), (24, 32)], {}, {}, {}),
                         # something more complicated branched
                         # here we have multiple bonding descriptors
 #                       # despite being the same residue we have 3 fragments after adding hydrgens
@@ -146,7 +146,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         [('A', 'O H C H H'), ('B', 'C H H C H H H'),],
                         'O H C H H C H H H',
                         [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5),
-                         (5, 6), (5, 7), (5, 8)], {}, {}),
+                         (5, 6), (5, 7), (5, 8)], {}, {}, {}),
                         # smiple squash operator; unconsumed operators
                         ("{[#A][#B]}.{#A=OC[!],#B=[$][!]CC}",
                         #       0 1 2 3 4           1 5 3 4 6 7 8
@@ -157,7 +157,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         [('A', 'O H C H H'), ('B', 'C H H C H H H'),],
                         'O H C H H C H H H',
                         [(0, 1), (0, 2), (2, 3), (2, 4), (2, 5),
-                         (5, 6), (5, 7), (5, 8)], {}, {}),
+                         (5, 6), (5, 7), (5, 8)], {}, {}, {}),
                         # smiple squash operator; plus connect operator
                         ("{[#A][#B][#C]}.{#A=OC[!],#B=[$][!]CC,#C=[$]O}",
                         #       0 1 2 3 4           1 5 3 4 6 7 8
@@ -168,7 +168,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         [('A', 'O H C H'), ('B', 'C H C H H H'), ('C', 'O H')],
                         'O H C H C H H H O H',
                         [(0, 1), (0, 2), (2, 3), (2, 4),
-                         (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}),
+                         (4, 5), (4, 6), (4, 7), (2, 8), (8, 9)], {}, {}, {}),
                         # THF like test case with double edge and squash operator
                         ("{[#A]=[#B]}.{#A=[!]COC[!],#B=[!]CCCC[!]}",
                         [('A', 'O C C H H H H'),
@@ -176,7 +176,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         'O C C H H H H C C H H H H',
                         [(0, 2), (0, 3), (2, 4), (2, 5),
                          (3, 6), (3, 7), (2, 8), (3, 9),
-                         (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}),
+                         (8, 9), (9, 12), (9, 13), (8, 10), (8, 11)], {}, {}, {}),
                         # Toluene like test case with squash operator and aromaticity
                         ("{[#SC3]1[#TC5][#TC5]1}.{#SC3=Cc(c[!])c[!],#TC5=[!]ccc[!]}",
                         [('SC3', 'C C H H H C H C H'),
@@ -184,7 +184,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         'C C H H H C H C H C H C H C H',
                         [(0, 1), (0, 2), (0, 3), (0, 4), (1, 5),
                          (1, 7), (5, 9), (5, 6), (7, 13), (7, 8),
-                         (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}),
+                         (9, 11), (9, 10), (11, 13), (11, 12), (13, 14)], {}, {}, {}),
                          # simple chirality assigment with rings
                          ("{[#GLC]}.{#GLC=C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O}",
                          # 0 1 2 3
@@ -194,7 +194,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                           (2, 15), (3, 4), (3, 9), (3, 16), (4, 5), (4, 8), (4, 17), (5, 6), (5, 7), (5, 18),
                           (7, 19), (8, 20), (9, 21), (10, 22), (11, 23)],
                          {1: (6, 14, 2, 0), 2: (1, 15, 3, 10), 3: (2, 16, 9, 4),
-                          4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}),
+                          4: (3, 17, 5, 8), 5: (4, 18, 6, 7)}, {}, {}),
                         # simple chirality assigment between fragments
                         ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@H][>]C(=O)OC}",
                         # 0 1 2 3
@@ -204,7 +204,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         [(0, 1), (0, 2), (2, 3), (2, 4),
                          (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12),
                          (10, 13), (5, 14), (14, 15)],
-                        {3: (2, 10, 4, 14)}, {}),
+                        {3: (2, 10, 4, 14)}, {}, {}),
                         # simple chirality assigment between fragments inv
                         ("{[#A][#B][#C]}.{#A=O[>],#C=O[<],#B=[<]C[C@@H][>]C(=O)OC}",
                         # 0 1 2 3
@@ -214,21 +214,21 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         [(0, 1), (0, 2), (2, 3), (2, 4),
                          (2, 5), (5, 6), (5, 7), (7, 8), (7, 9), (9, 10), (10, 11), (10, 12),
                          (10, 13), (5, 14), (14, 15)],
-                        {3: (2, 10, 14, 4)}, {}),
+                        {3: (2, 10, 14, 4)}, {}, {}),
                         # smiple ez isomerism assigment between fragments inv
                         ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(\F)C}",
                         [('A', 'C C F H H H'), ('B', 'C F C H H H')],
                         'C C F H H H F C C H H H',
                         [(0, 1), (1, 2), (0, 3), (0, 4),
                          (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)],
-                        {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}),
+                        {}, {2: (2, 1, 6, 7, 'trans'), 7: (7, 6, 1, 2, 'trans')}, {}),
                         # simple ez isomerism assigment between fragments inv
                         ("{[#A][#B]}.{#A=CC(/F)=[$],#B=[$]=C(/F)C}",
                         [('A', 'C C F H H H'), ('B', 'C F C H H H')],
                         'C C F H H H F C C H H H',
                         [(0, 1), (1, 2), (0, 3), (0, 4),
                          (0, 5), (1, 7), (7, 6), (7, 8), (8, 9), (8, 10), (8, 11)],
-                        {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}),
+                        {}, {2: (2, 1, 6, 7, 'cis'), 7: (7, 6, 1, 2, 'cis')}, {}),
                         # test skip virtual nodes
                         ("{[#SP4]1.2[#SP4].3[#SP1r]1.[#TC4]23}.{#SP4=OC[$]C[$]O,#SP1r=[$]OC[$]CO}",
                         [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'),
@@ -238,9 +238,29 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                          (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17),
                          (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19),
                          (18, 21), (18, 22), (19, 23)],
-                        {},{}),
+                        {},{}, {}),
+                        # test weights
+                        ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5]C[$]C[$]O,#SP1r=[$]OC[$]CO}",
+                        [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'),
+                         ('SP1r', 'O C C O H H H H')],
+                        'O C C O H H H H O C C O H H H H O C C O H H H H',
+                        [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6),
+                         (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17),
+                         (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19),
+                         (18, 21), (18, 22), (19, 23)],
+                        {},{}, {0: 0.5, 4: 0.5, 8: 0.5, 12: 0.5}),
+                        # test 2 weights
+                        ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5][C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}",
+                        [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'),
+                         ('SP1r', 'O C C O H H H H')],
+                        'O C C O H H H H O C C O H H H H O C C O H H H H',
+                        [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6),
+                         (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17),
+                         (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19),
+                         (18, 21), (18, 22), (19, 23)],
+                        {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.5, 8: 0.5, 9: 0.1, 12: 0.5, 13: 0.1}),
 ))
-def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez):
+def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez, weights):
     meta_mol, molecule = MoleculeResolver.from_string(smile).resolve()
 
     # loop and compare fragments first
@@ -278,6 +298,12 @@ def _ele_match(n1, n2):
     if ez:
         ez_assigned = nx.get_node_attributes(molecule, 'ez_isomer')
         assert ez == ez_assigned
+    # check weights
+    if weights:
+        mol_weights = {node: 1 for node in ref_graph}
+        mol_weights.update(weights)
+        weights_assigned = nx.get_node_attributes(molecule, 'weight')
+        assert mol_weights == weights_assigned
 
 @pytest.mark.parametrize('case, cgsmiles_str, ref_string',(
     # case 1: here only the meta-graph is described by the

From 985500ad705c268cb04abf9b8f4428fdda9b2e7f Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Mon, 14 Oct 2024 18:56:50 +0200
Subject: [PATCH 02/16] update tests

---
 cgsmiles/tests/test_cgsmile_parsing.py | 64 ++++++++++++++++----------
 cgsmiles/tests/test_sampler.py         |  1 +
 2 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py
index 77333c9..2f947ec 100644
--- a/cgsmiles/tests/test_cgsmile_parsing.py
+++ b/cgsmiles/tests/test_cgsmile_parsing.py
@@ -183,94 +183,108 @@ def test_read_cgsmiles(smile, nodes, edges, orders):
     fragnames = nx.get_node_attributes(meta_mol, 'fragname')
     assert nodes == list(fragnames.values())
 
-@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez',(
+@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, weights',(
                         # smiple symmetric bonding
                         ("[$]COC[$]",
                          "COC",
                         {0: ["$1"], 2: ["$1"]},
                         None,
+                        None,
                         None),
                         # smiple symmetric bonding with more than one name
                         ("[$1A]COC[$1A]",
                          "COC",
                         {0: ["$1A1"], 2: ["$1A1"]},
                         None,
+                        None,
                         None),
                         # smiple bonding multiletter atom
                         ("Clc[$]c[$]",
                          "Clcc",
                         {1: ["$1"], 2: ["$1"]},
                         None,
+                        None,
                         None),
                         # simple symmetric but with explicit hydrogen
                         ("[$][CH2]O[CH2][$]",
                          "[CH2]O[CH2]",
                         {0: ["$1"], 2: ["$1"]},
                         None,
+                        None,
                         None),
                         # smiple symmetric bonding; multiple descript
                         ("[$]COC[$][$1]",
                          "COC",
                         {0: ["$1"], 2: ["$1", "$11"]},
                         None,
+                        None,
                         None),
                         # named different bonding descriptors
                         ("[$1]CCCC[$2]",
                          "CCCC",
                         {0: ["$11"], 3: ["$21"]},
                         None,
+                        None,
                         None),
                         # ring and bonding descriptors
                         ("[$1]CC[$2]C1CCCCC1",
                          "CCC1CCCCC1",
                         {0: ["$11"], 1: ["$21"]},
                         None,
+                        None,
                         None),
                         # bonding descript. after branch
                         ("C(COC[$1])[$2]CCC[$3]",
                          "C(COC)CCC",
                         {0: ["$21"], 3: ["$11"], 6: ["$31"]},
                         None,
+                        None,
                         None),
                         # left rigth bonding desciptors
                         ("[>]COC[<]",
                         "COC",
                         {0: [">1"], 2: ["<1"]},
                         None,
+                        None,
                         None),
                         # simple chirality in residue
                         ("[>]C[C@](F)(B)N[<]",
                         "C[C](F)(B)N",
                         {0: [">1"], 4: ["<1"]},
                         {1: ('@', [])},
+                        None,
                         None),
                         # simple chirality inverse in residue
                         ("[>]C[C@@](F)(B)N[<]",
                         "C[C](F)(B)N",
                         {0: [">1"], 4: ["<1"]},
                         {1: ('@@', [])},
+                        None,
                         None),
                         # \ fragment split
                         ("[>]CC(\F)=[<]",
                         "CC(F)",
                         {0: [">1"], 1: ["<2"]},
                         None,
-                        {2: (2, 1, '\\')}),
+                        {2: (2, 1, '\\')},
+                        None),
                         # / fragment split
                         ("[>]CC(/F)=[<]",
                         "CC(F)",
                         {0: [">1"], 1: ["<2"]},
                         None,
-                        {2: (2, 1, '/')}),
+                        {2: (2, 1, '/')},
+                        None),
                         # both in one fragment
                         ("[>]CC(/F)=C(\F)C[<]",
                         "CC(F)=C(F)C",
                         {0: [">1"], 5: ["<1"]},
                         None,
-                        {2: (2, 1, '/'), 4: (4, 3, '\\')}),
+                        {2: (2, 1, '/'), 4: (4, 3, '\\')},
+                        None),
 ))
-def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez):
-    new_smile, new_bonding, rs_isomers, ez_isomers = strip_bonding_descriptors(big_smile)
+def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights):
+    new_smile, new_bonding, rs_isomers, ez_isomers, weights = strip_bonding_descriptors(big_smile)
     assert new_smile == smile
     assert new_bonding == bonding
     if rs:
@@ -281,50 +295,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez):
 @pytest.mark.parametrize('fragment_str, nodes, edges',(
                         # single fragment
                         ("{#PEO=[$]COC[$]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
                                 )},
                         {"PEO": [(0, 1), (1, 2)]}),
                         # single fragment but with explicit hydrogen in smiles
                         ("{#PEO=[$][CH2]O[CH2][$]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}),
                                 )},
                         {"PEO": [(0, 1), (1, 2)]}),
                         # test NH3 terminal
                         ("{#AMM=N[$]}",
-                        {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3}),
+                        {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "weight": 1}),
                                 )},
                         {"AMM": []}),
                         # single fragment + 1 terminal (i.e. only 1 bonding descrpt
                         ("{#PEO=[$]COC[$],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O"}),)},
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "weight": 1}),)},
                         {"PEO": [(0, 1), (1, 2)],
                          "OHter": []}),
                         # single fragment + 1 terminal but multiple bond descritp.
                         # this adjust the hydrogen count
                         ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "weight": 1}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}),)},
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}),)},
                         {"PEO": [(0, 1), (1, 2)],
                          "OHter": []}),
                         # single fragment + 1 terminal but multiple bond descritp.
                         # but explicit hydrogen in the smiles string
                         ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "weight": 1}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1}),
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}),
                                    )},
                         {"PEO": [(0, 1), (1, 2),],
                          "OHter": []}),
diff --git a/cgsmiles/tests/test_sampler.py b/cgsmiles/tests/test_sampler.py
index e1b08cd..158178f 100644
--- a/cgsmiles/tests/test_sampler.py
+++ b/cgsmiles/tests/test_sampler.py
@@ -104,6 +104,7 @@ def test_add_fragment(graph_str,
     ref_graph = read_cgsmiles(ref_mol)
     nx.set_node_attributes(ref_graph, bonding, 'bonding')
     nx.set_node_attributes(ref_graph, fragid, 'fragid')
+    nx.set_node_attributes(ref_graph, 1, 'weight')
     atomnames = nx.get_node_attributes(ref_graph, 'fragname')
     nx.set_node_attributes(ref_graph, atomnames, 'atomname')
     nx.set_node_attributes(ref_graph, resnames, 'fragname')

From fd838e181acde097ff4b21b519a234681b02d9df Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Mon, 14 Oct 2024 19:02:10 +0200
Subject: [PATCH 03/16] add tests

---
 cgsmiles/tests/test_cgsmile_parsing.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py
index 2f947ec..3ca612d 100644
--- a/cgsmiles/tests/test_cgsmile_parsing.py
+++ b/cgsmiles/tests/test_cgsmile_parsing.py
@@ -191,6 +191,21 @@ def test_read_cgsmiles(smile, nodes, edges, orders):
                         None,
                         None,
                         None),
+                        # smiple symmetric bonding with weigth
+                        ("[$]C[O;0.5]C[$]",
+                         "C[O]C",
+                        {0: ["$1"], 2: ["$1"]},
+                        None,
+                        None,
+                        {1: 0.5}),
+                        # smiple symmetric bonding with weigth
+                        # using cgsmiles string
+                        ("[$][#TC4][#OT1;0.5][#CD1][$]",
+                         "[#TC4][#OT1][#CD1]",
+                        {0: ["$1"], 2: ["$1"]},
+                        None,
+                        None,
+                        {1: 0.5}),
                         # smiple symmetric bonding with more than one name
                         ("[$1A]COC[$1A]",
                          "COC",
@@ -284,13 +299,19 @@ def test_read_cgsmiles(smile, nodes, edges, orders):
                         None),
 ))
 def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights):
-    new_smile, new_bonding, rs_isomers, ez_isomers, weights = strip_bonding_descriptors(big_smile)
+    new_smile, new_bonding, rs_isomers, ez_isomers, weights_out = strip_bonding_descriptors(big_smile)
     assert new_smile == smile
     assert new_bonding == bonding
     if rs:
         assert rs == rs_isomers
     if ez:
         assert ez == ez_isomers
+    # here we check that the weights are correctly
+    # set for nodes with weights; the default is
+    # checked in another test
+    if weights:
+        for node, weight in weights.items():
+            assert weights_out[node] == weight
 
 @pytest.mark.parametrize('fragment_str, nodes, edges',(
                         # single fragment

From 4c4a12bf6308da4c5bacf67d2ad140cbb6f5f4de Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Thu, 31 Oct 2024 16:16:31 +0100
Subject: [PATCH 04/16] have hydrogen weights

---
 cgsmiles/read_fragments.py             | 18 ++++++++---
 cgsmiles/tests/test_cgsmile_parsing.py | 42 +++++++++++++++++++++-----
 2 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
index fa28829..61be22e 100644
--- a/cgsmiles/read_fragments.py
+++ b/cgsmiles/read_fragments.py
@@ -148,6 +148,7 @@ def strip_bonding_descriptors(fragment_string):
     ez_isomer_atoms = {}
     rs_isomers = {}
     weights = {}
+    hydrogen_weights = defaultdict(list)
     smile = ""
     node_count = 0
     prev_node = 0
@@ -185,7 +186,12 @@ def strip_bonding_descriptors(fragment_string):
                     # we have weights
                     elif peek == ';':
                         weight, smile_iter = get_weight(smile_iter)
-                        weights[node_count] = weight
+                        # hydrogen atoms are implicit so we filter
+                        # them out here
+                        if atom[1:] == 'H':
+                            hydrogen_weights[prev_node].append(weight)
+                        else:
+                            weights[node_count] = weight
                     else:
                         atom += peek
                     peek = next(smile_iter)
@@ -236,7 +242,7 @@ def strip_bonding_descriptors(fragment_string):
                 bonded_node = _find_bonded_ring_node(ring_nodes, node)
                 rs_isomers[node][1].append(bonded_node)
 
-    return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, weights
+    return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, weights, hydrogen_weights
 
 def fragment_iter(fragment_str, all_atom=True):
     """
@@ -266,13 +272,15 @@ def fragment_iter(fragment_str, all_atom=True):
         delim = fragment.find('=', 0)
         fragname = fragment[1:delim]
         frag_smile = fragment[delim+1:]
-        smile, bonding_descrpt, rs_isomers, ez_isomers, weights = strip_bonding_descriptors(frag_smile)
+        smile, bonding_descrpt, rs_isomers, ez_isomers, weights, h_weights = strip_bonding_descriptors(frag_smile)
         if smile == "H":
             mol_graph = nx.Graph()
             mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0])
             nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
         elif all_atom:
-            mol_graph = pysmiles.read_smiles(smile, reinterpret_aromatic=False, strict=False)
+            mol_graph = pysmiles.read_smiles(smile,
+                                             reinterpret_aromatic=False,
+                                             strict=False)
             nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
             nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer')
             # we need to split countable node keys and the associated value
@@ -280,6 +288,8 @@ def fragment_iter(fragment_str, all_atom=True):
             ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()}
             nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms')
             nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class')
+            # set the hydrogen weight attribute
+            nx.set_node_attributes(mol_graph, h_weights, 'hweight')
         # we deal with a CG resolution graph
         else:
             mol_graph = read_cgsmiles(smile)
diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py
index 5da2572..5d0a8b5 100644
--- a/cgsmiles/tests/test_cgsmile_parsing.py
+++ b/cgsmiles/tests/test_cgsmile_parsing.py
@@ -262,35 +262,48 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
         set_charges = nx.get_node_attributes(meta_mol, 'charge')
         assert set_charges == charges
 
-@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez',(
+@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, weights, hweights',(
   # smiple symmetric bonding
                         ("[$]COC[$]",
                          "COC",
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
+                        None,
                         None),
-                        # smiple symmetric bonding with weigth
+                        # smiple symmetric bonding with weight
                         ("[$]C[O;0.5]C[$]",
                          "C[O]C",
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {1: 0.5}),
-                        # smiple symmetric bonding with weigth
+                        {1: 0.5},
+                        None),
+                        # smiple symmetric bonding with weight
                         # using cgsmiles string
                         ("[$][#TC4][#OT1;0.5][#CD1][$]",
                          "[#TC4][#OT1][#CD1]",
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {1: 0.5}),
+                        {1: 0.5},
+                        None),
+                        # smiple symmetric bonding with weight
+                        # using open smiles and hweights
+                        ("[$]CO[C;0.5][$]([H;0.1])[H;0.2]",
+                         "CO[C]([H])[H]",
+                        {0: ["$1"], 2: ["$1"]},
+                        None,
+                        None,
+                        {2: 0.5},
+                        {2: [0.1, 0.2]}),
                         # smiple symmetric bonding with more than one name
                         ("[$1A]COC[$1A]",
                          "COC",
                         {0: ["$1A1"], 2: ["$1A1"]},
                         None,
                         None,
+                        None,
                         None),
                         # smiple bonding multiletter atom
                         ("Clc[$]c[$]",
@@ -298,6 +311,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {1: ["$1"], 2: ["$1"]},
                         None,
                         None,
+                        None,
                         None),
                         # simple symmetric but with explicit hydrogen
                         ("[$][CH2]O[CH2][$]",
@@ -305,6 +319,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
+                        None,
                         None),
                         # smiple symmetric bonding; multiple descript
                         ("[$]COC[$][$1]",
@@ -312,6 +327,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1", "$11"]},
                         None,
                         None,
+                        None,
                         None),
                         # named different bonding descriptors
                         ("[$1]CCCC[$2]",
@@ -319,6 +335,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$11"], 3: ["$21"]},
                         None,
                         None,
+                        None,
                         None),
                         # ring and bonding descriptors
                         ("[$1]CC[$2]C1CCCCC1",
@@ -326,6 +343,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$11"], 1: ["$21"]},
                         None,
                         None,
+                        None,
                         None),
                         # bonding descript. after branch
                         ("C(COC[$1])[$2]CCC[$3]",
@@ -333,6 +351,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$21"], 3: ["$11"], 6: ["$31"]},
                         None,
                         None,
+                        None,
                         None),
                         # left rigth bonding desciptors
                         ("[>]COC[<]",
@@ -340,6 +359,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 2: ["<1"]},
                         None,
                         None,
+                        None,
                         None),
                         # simple chirality in residue
                         ("[>]C[C@](F)(B)N[<]",
@@ -347,6 +367,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 4: ["<1"]},
                         {1: ('@', [])},
                         None,
+                        None,
                         None),
                         # simple chirality inverse in residue
                         ("[>]C[C@@](F)(B)N[<]",
@@ -354,6 +375,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 4: ["<1"]},
                         {1: ('@@', [])},
                         None,
+                        None,
                         None),
                         # \ fragment split
                         ("[>]CC(\F)=[<]",
@@ -361,6 +383,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 1: ["<2"]},
                         None,
                         {2: (2, 1, '\\')},
+                        None,
                         None),
                         # / fragment split
                         ("[>]CC(/F)=[<]",
@@ -368,6 +391,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 1: ["<2"]},
                         None,
                         {2: (2, 1, '/')},
+                        None,
                         None),
                         # both in one fragment
                         ("[>]CC(/F)=C(\F)C[<]",
@@ -375,10 +399,11 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 5: ["<1"]},
                         None,
                         {2: (2, 1, '/'), 4: (4, 3, '\\')},
+                        None,
                         None),
 ))
-def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights):
-    new_smile, new_bonding, rs_isomers, ez_isomers, weights_out = strip_bonding_descriptors(big_smile)
+def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights, hweights):
+    new_smile, new_bonding, rs_isomers, ez_isomers, weights_out, hweights_out = strip_bonding_descriptors(big_smile)
     assert new_smile == smile
     assert new_bonding == bonding
     if rs:
@@ -391,6 +416,9 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights):
     if weights:
         for node, weight in weights.items():
             assert weights_out[node] == weight
+    if hweights:
+        for node, weight in hweights.items():
+            assert hweights_out[node] == weight
 
 @pytest.mark.parametrize('fragment_str, nodes, edges',(
                         # single fragment

From fa55b8765a9a4561fdadf7c2cb9fb6f2a5b2c1a1 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Thu, 31 Oct 2024 16:52:35 +0100
Subject: [PATCH 05/16] tests for hydrogen weights

---
 cgsmiles/tests/test_cgsmile_parsing.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py
index 5d0a8b5..dfadbb6 100644
--- a/cgsmiles/tests/test_cgsmile_parsing.py
+++ b/cgsmiles/tests/test_cgsmile_parsing.py
@@ -297,6 +297,14 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         None,
                         {2: 0.5},
                         {2: [0.1, 0.2]}),
+                        # H atom with weight goes first
+                        ("[H;0.3]C[$]O[C;0.5][$]",
+                         "[H]CO[C]",
+                        {1: ["$1"], 3: ["$1"]},
+                        None,
+                        None,
+                        {3: 0.5},
+                        {1: [0.3]}),
                         # smiple symmetric bonding with more than one name
                         ("[$1A]COC[$1A]",
                          "COC",

From 39e2857932299fb37ed40ea066d8eaf868a805fb Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Thu, 31 Oct 2024 16:52:53 +0100
Subject: [PATCH 06/16] take care of case where h comes first

---
 cgsmiles/read_fragments.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
index 61be22e..6262e8e 100644
--- a/cgsmiles/read_fragments.py
+++ b/cgsmiles/read_fragments.py
@@ -188,7 +188,9 @@ def strip_bonding_descriptors(fragment_string):
                         weight, smile_iter = get_weight(smile_iter)
                         # hydrogen atoms are implicit so we filter
                         # them out here
-                        if atom[1:] == 'H':
+                        if atom[1:] == 'H' and node_count == 0:
+                            hydrogen_weights[1].append(weight)
+                        elif atom[1:] == 'H':
                             hydrogen_weights[prev_node].append(weight)
                         else:
                             weights[node_count] = weight

From 5c27942e930dd9adba619d998be2216e7269f921 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Thu, 31 Oct 2024 16:53:16 +0100
Subject: [PATCH 07/16] have hydrogen weights reconstructed

---
 cgsmiles/pysmiles_utils.py              |  9 +++++++--
 cgsmiles/tests/test_molecule_resolve.py | 10 ++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
index 419efb3..ef7cd5a 100644
--- a/cgsmiles/pysmiles_utils.py
+++ b/cgsmiles/pysmiles_utils.py
@@ -84,9 +84,14 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
             mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
             mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]
         if mol_graph.nodes[node].get("element", "*") == "H":
-            # make sure the weights are copied for implicit h-atoms
             anchor = list(mol_graph.neighbors(node))[0]
-            weight = mol_graph.nodes[anchor].get("weight", 1)
+            # the weight for the hydrogen atom was explicitly set
+            hweights = mol_graph.nodes[anchor].get('hweight', [])
+            if hweights:
+                weight = hweights.pop()
+            # make sure the weights are copied for implicit h-atoms
+            else:
+                weight = mol_graph.nodes[anchor].get("weight", 1)
             mol_graph.nodes[node]["weight"] = weight
 
 def annotate_ez_isomers(molecule):
diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py
index 00880d7..6dabcd7 100644
--- a/cgsmiles/tests/test_molecule_resolve.py
+++ b/cgsmiles/tests/test_molecule_resolve.py
@@ -259,6 +259,16 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                          (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19),
                          (18, 21), (18, 22), (19, 23)],
                         {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.5, 8: 0.5, 9: 0.1, 12: 0.5, 13: 0.1}),
+                        # test 2 weights and hydrogen weights
+                        ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[O;0.5]([H;0.2])[C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}",
+                        [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'),
+                         ('SP1r', 'O C C O H H H H')],
+                        'O C C O H H H H O C C O H H H H O C C O H H H H',
+                        [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6),
+                         (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17),
+                         (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19),
+                         (18, 21), (18, 22), (19, 23)],
+                        {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.2, 8: 0.5, 9: 0.1, 12: 0.2, 13: 0.1}),
 ))
 def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez, weights):
     meta_mol, molecule = MoleculeResolver.from_string(smile).resolve()

From 49ca226dbf0210eb15cdb8e8ba20ea1623b9e929 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Fri, 8 Nov 2024 11:20:55 +0100
Subject: [PATCH 08/16] implement dialects and annotations

---
 cgsmiles/pysmiles_utils.py              |  4 +-
 cgsmiles/read_cgsmiles.py               | 29 ++++-----
 cgsmiles/read_fragments.py              | 66 ++++++++------------
 cgsmiles/tests/test_cgsmile_parsing.py  | 81 +++++++++++++++----------
 cgsmiles/tests/test_molecule_resolve.py | 19 +++---
 cgsmiles/tests/test_sampler.py          |  2 +-
 6 files changed, 100 insertions(+), 101 deletions(-)

diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
index ef7cd5a..28fe0c2 100644
--- a/cgsmiles/pysmiles_utils.py
+++ b/cgsmiles/pysmiles_utils.py
@@ -91,8 +91,8 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
                 weight = hweights.pop()
             # make sure the weights are copied for implicit h-atoms
             else:
-                weight = mol_graph.nodes[anchor].get("weight", 1)
-            mol_graph.nodes[node]["weight"] = weight
+                weight = mol_graph.nodes[anchor].get("w", 1)
+            mol_graph.nodes[node]["w"] = weight
 
 def annotate_ez_isomers(molecule):
     """
diff --git a/cgsmiles/read_cgsmiles.py b/cgsmiles/read_cgsmiles.py
index fbbbe4e..aaf52dd 100644
--- a/cgsmiles/read_cgsmiles.py
+++ b/cgsmiles/read_cgsmiles.py
@@ -2,6 +2,7 @@
 import re
 import numpy as np
 import networkx as nx
+from .dialects import parse_graph_base_node
 
 PATTERNS = {"bond_anchor": r"\[\$.*?\]",
             "place_holder": r"\[\#.*?\]",
@@ -31,21 +32,21 @@ def _expand_branch(mol_graph, current, anchor, recipe):
     anchor: abc.hashable
         anchor to which to connect current node
 
-    recipe: list[(str, int, int)]
+    recipe: list[(str, int, dict, int)]
         list storing tuples of node names and
-        the number of times the node has to be added
-        and their bond order
+        the number of times the node has to be added,
+        a dict of attributes and the bond order
 
     Returns
     -------
     nx.Graph
     """
     prev_node = anchor
-    for bdx, (fragname, n_mon, order) in enumerate(recipe):
+    for bdx, (n_mon, attributes, order) in enumerate(recipe):
         if bdx == 0:
             anchor = current
         for _ in range(0, n_mon):
-            mol_graph.add_node(current, fragname=fragname)
+            mol_graph.add_node(current, **attributes)
             mol_graph.add_edge(prev_node, current, order=order)
 
             prev_node = current
@@ -144,7 +145,7 @@ def read_cgsmiles(pattern):
             # the recipe for making the branch includes the anchor;
             # which is hence the first residue in the list
             # at this point the bond order is still 1 unless we have an expansion
-            recipes[branch_anchor[-1]] = [(mol_graph.nodes[prev_node]['fragname'], 1, 1)]
+            recipes[branch_anchor[-1]] = [(1, attributes, 1)]
 
         # here we check if the atom is followed by a cycle marker
         # in this case we have an open cycle and close it
@@ -215,26 +216,18 @@ def read_cgsmiles(pattern):
         # the fragname starts at the second character and ends
         # one before the last according to the above pattern
         fragname = match.group(0)[2:-1]
-        # check for charge
-        charge = 0.0
-        for sign in ["+", "-"]:
-            if sign in fragname:
-                fragname, charge = fragname.split(sign)
-                if len(charge) == 0:
-                    charge = float(sign+"1")
-                else:
-                    charge = float(sign+charge)
+        # read the annotations
+        attributes = parse_graph_base_node(fragname)
 
         # if this residue is part of a branch we store it in
         # the recipe dict together with the anchor residue
         # and expansion number
         if branching:
-            recipes[branch_anchor[-1]].append((fragname, n_mon, prev_bond_order))
-
+            recipes[branch_anchor[-1]].append((n_mon, attributes, prev_bond_order))
         # new we add new residue as often as required
         connection = []
         for _ in range(0, n_mon):
-            mol_graph.add_node(current, fragname=fragname, charge=charge)
+            mol_graph.add_node(current, **attributes)
 
             if prev_node is not None:
                 mol_graph.add_edge(prev_node, current, order=prev_bond_order)
diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
index 6262e8e..d713477 100644
--- a/cgsmiles/read_fragments.py
+++ b/cgsmiles/read_fragments.py
@@ -5,6 +5,7 @@
 import networkx as nx
 import pysmiles
 from .read_cgsmiles import read_cgsmiles
+from .dialects import _fragment_node_parser
 
 class PeekIter(object):
     """
@@ -95,31 +96,6 @@ def collect_ring_number(smile_iter, token, node_count, rings):
 
     return smile_iter, token, partial_str, rings
 
-def get_weight(smile_iter):
-    """
-    Extracts weights given to atoms/nodes in
-    fragments. The iter should be advanced
-    up to the weight marker ;.
-
-    Parameters
-    ----------
-    smile_iter: class.PeekIter
-
-    Returns
-    -------
-    float:
-        the weight
-    PeekIter
-        the advanced iter object
-    """
-    num = []
-    for digit in smile_iter:
-        num.append(digit)
-        if smile_iter.peek() in [']', '@', 'H']:
-            break
-    out = float("".join(num))
-    return out, smile_iter
-
 def strip_bonding_descriptors(fragment_string):
     """
     Processes a CGSmiles fragment string by
@@ -147,7 +123,8 @@ def strip_bonding_descriptors(fragment_string):
     rings = defaultdict(list)
     ez_isomer_atoms = {}
     rs_isomers = {}
-    weights = {}
+    attributes = defaultdict(dict)
+    record_attributes = False
     hydrogen_weights = defaultdict(list)
     smile = ""
     node_count = 0
@@ -174,8 +151,7 @@ def strip_bonding_descriptors(fragment_string):
                 bonding_descrpt[prev_node].append(bond_descrp + str(order))
             else:
                 atom = token
-                # set the default weight
-                weights[node_count] = 1
+                attribute_str = ""
                 while peek != ']':
                     # deal with rs chirality
                     if peek == '@':
@@ -185,22 +161,29 @@ def strip_bonding_descriptors(fragment_string):
                         rs_isomers[node_count] = (chiral_token, [])
                     # we have weights
                     elif peek == ';':
-                        weight, smile_iter = get_weight(smile_iter)
-                        # hydrogen atoms are implicit so we filter
-                        # them out here
-                        if atom[1:] == 'H' and node_count == 0:
-                            hydrogen_weights[1].append(weight)
-                        elif atom[1:] == 'H':
-                            hydrogen_weights[prev_node].append(weight)
-                        else:
-                            weights[node_count] = weight
+                        record_attributes = True
+                    elif record_attributes:
+                        attribute_str += peek
                     else:
                         atom += peek
+
                     peek = next(smile_iter)
 
+                record_attributes=False
+                # here we do some post processing cleanup
+                node_attributes = _fragment_node_parser(attribute_str)
+                attributes[node_count].update(node_attributes)
+                # hydrogen atoms are implicit so we filter
+                # them out here
+                if atom[1:] == 'H' and node_count == 0:
+                    hydrogen_weights[1].append(attributes[node_count]['w'])
+                elif atom[1:] == 'H':
+                    hydrogen_weights[prev_node].append(attributes[node_count]['w'])
+
                 smile = smile + atom + "]"
                 prev_node = node_count
                 node_count += 1
+
         elif token == '(':
             anchor = prev_node
             smile += token
@@ -234,7 +217,7 @@ def strip_bonding_descriptors(fragment_string):
             current_order = None
             prev_node = node_count
             # set default weight
-            weights[node_count] = 1
+            attributes[node_count]['w'] = 1
             node_count += 1
 
     # we need to annotate rings to the chiral isomers
@@ -244,7 +227,7 @@ def strip_bonding_descriptors(fragment_string):
                 bonded_node = _find_bonded_ring_node(ring_nodes, node)
                 rs_isomers[node][1].append(bonded_node)
 
-    return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, weights, hydrogen_weights
+    return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, attributes, hydrogen_weights
 
 def fragment_iter(fragment_str, all_atom=True):
     """
@@ -274,7 +257,7 @@ def fragment_iter(fragment_str, all_atom=True):
         delim = fragment.find('=', 0)
         fragname = fragment[1:delim]
         frag_smile = fragment[delim+1:]
-        smile, bonding_descrpt, rs_isomers, ez_isomers, weights, h_weights = strip_bonding_descriptors(frag_smile)
+        smile, bonding_descrpt, rs_isomers, ez_isomers, attributes, h_weights = strip_bonding_descriptors(frag_smile)
         if smile == "H":
             mol_graph = nx.Graph()
             mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0])
@@ -305,7 +288,8 @@ def fragment_iter(fragment_str, all_atom=True):
 
         nx.set_node_attributes(mol_graph, fragname, 'fragname')
         nx.set_node_attributes(mol_graph, 0, 'fragid')
-        nx.set_node_attributes(mol_graph, weights, 'weight')
+        # set other attributes
+        nx.set_node_attributes(mol_graph, attributes)
         yield fragname, mol_graph
 
 def read_fragments(fragment_str, all_atom=True, fragment_dict=None):
diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py
index dfadbb6..2e2919f 100644
--- a/cgsmiles/tests/test_cgsmile_parsing.py
+++ b/cgsmiles/tests/test_cgsmile_parsing.py
@@ -11,7 +11,13 @@
                         [(0, 1), (1, 2)],
                         [1, 1]),
                         # smiple charges
-                        ("{[#PMA+][#PEO][#PMA-0.25]}",
+                        ("{[#PMA;+1][#PEO][#PMA;-0.25]}",
+                        ["PMA", "PEO", "PMA"],
+                        {0: 1.0, 1: 0.0, 2:-0.25},
+                        [(0, 1), (1, 2)],
+                        [1, 1]),
+                        # smiple charges with keyword
+                        ("{[#PMA;c=+1][#PEO][#PMA;c=-0.25]}",
                         ["PMA", "PEO", "PMA"],
                         {0: 1.0, 1: 0.0, 2:-0.25},
                         [(0, 1), (1, 2)],
@@ -256,13 +262,15 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
         assert meta_mol.edges[edge]["order"] == order
 
     fragnames = nx.get_node_attributes(meta_mol, 'fragname')
+    print(fragnames)
+    print(nodes)
     assert nodes == list(fragnames.values())
 
     if charges:
-        set_charges = nx.get_node_attributes(meta_mol, 'charge')
+        set_charges = nx.get_node_attributes(meta_mol, 'c')
         assert set_charges == charges
 
-@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, weights, hweights',(
+@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, attrs, hweights',(
   # smiple symmetric bonding
                         ("[$]COC[$]",
                          "COC",
@@ -277,7 +285,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {1: 0.5},
+                        {'w': {0: 1, 1: 0.5, 2: 1}},
                         None),
                         # smiple symmetric bonding with weight
                         # using cgsmiles string
@@ -286,8 +294,17 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {1: 0.5},
+                        {'w': {0: 1, 1: 0.5, 2: 1}},
                         None),
+                      # # smiple symmetric bonding with random
+                      # # keyword argument
+                      # ("[$][#TC4][#OT1;r=abc][#CD1][$]",
+                      #  "[#TC4][#OT1][#CD1]",
+                      # {0: ["$1"], 2: ["$1"]},
+                      # None,
+                      # None,
+                      # {'w': {0: 1, 1: 1, 2: 1}, 'r': {1: 'abc'}},
+                      # None),
                         # smiple symmetric bonding with weight
                         # using open smiles and hweights
                         ("[$]CO[C;0.5][$]([H;0.1])[H;0.2]",
@@ -295,7 +312,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {2: 0.5},
+                        {'w':{0: 1, 1: 1, 2: 0.5}},
                         {2: [0.1, 0.2]}),
                         # H atom with weight goes first
                         ("[H;0.3]C[$]O[C;0.5][$]",
@@ -303,7 +320,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {1: ["$1"], 3: ["$1"]},
                         None,
                         None,
-                        {3: 0.5},
+                        {'w': {1: 1, 2: 1, 3: 0.5}},
                         {1: [0.3]}),
                         # smiple symmetric bonding with more than one name
                         ("[$1A]COC[$1A]",
@@ -410,8 +427,8 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         None,
                         None),
 ))
-def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights, hweights):
-    new_smile, new_bonding, rs_isomers, ez_isomers, weights_out, hweights_out = strip_bonding_descriptors(big_smile)
+def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs, hweights):
+    new_smile, new_bonding, rs_isomers, ez_isomers, attrs_out, hweights_out = strip_bonding_descriptors(big_smile)
     assert new_smile == smile
     assert new_bonding == bonding
     if rs:
@@ -421,9 +438,11 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights, h
     # here we check that the weights are correctly
     # set for nodes with weights; the default is
     # checked in another test
-    if weights:
-        for node, weight in weights.items():
-            assert weights_out[node] == weight
+    print(attrs_out)
+    if attrs:
+        for attr, node_attrs in attrs.items():
+            for node, value in node_attrs.items():
+                assert attrs_out[node][attr] == value
     if hweights:
         for node, weight in hweights.items():
             assert hweights_out[node] == weight
@@ -431,50 +450,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, weights, h
 @pytest.mark.parametrize('fragment_str, nodes, edges',(
                         # single fragment
                         ("{#PEO=[$]COC[$]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}),
                                 )},
                         {"PEO": [(0, 1), (1, 2)]}),
                         # single fragment but with explicit hydrogen in smiles
                         ("{#PEO=[$][CH2]O[CH2][$]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}),
                                 )},
                         {"PEO": [(0, 1), (1, 2)]}),
                         # test NH3 terminal
                         ("{#AMM=N[$]}",
-                        {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "weight": 1}),
+                        {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "w": 1}),
                                 )},
                         {"AMM": []}),
                         # single fragment + 1 terminal (i.e. only 1 bonding descrpt
                         ("{#PEO=[$]COC[$],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "weight": 1}),)},
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "w": 1}),)},
                         {"PEO": [(0, 1), (1, 2)],
                          "OHter": []}),
                         # single fragment + 1 terminal but multiple bond descritp.
                         # this adjust the hydrogen count
                         ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "weight": 1}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "w": 1}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}),)},
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "w": 1}),)},
                         {"PEO": [(0, 1), (1, 2)],
                          "OHter": []}),
                         # single fragment + 1 terminal but multiple bond descritp.
                         # but explicit hydrogen in the smiles string
                         ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "weight": 1}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "w": 1}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}),
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "w": 1}),
                                    )},
                         {"PEO": [(0, 1), (1, 2),],
                          "OHter": []}),
diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py
index 6dabcd7..65656a3 100644
--- a/cgsmiles/tests/test_molecule_resolve.py
+++ b/cgsmiles/tests/test_molecule_resolve.py
@@ -248,7 +248,9 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                          (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17),
                          (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19),
                          (18, 21), (18, 22), (19, 23)],
-                        {},{}, {0: 0.5, 4: 0.5, 8: 0.5, 12: 0.5}),
+                        {},{}, {0: 0.5, 1: 1, 2: 1, 3: 1, 4: 0.5, 5: 1, 6: 1, 7: 1, 8: 0.5,
+                        9: 1, 10: 1, 11: 1, 12: 0.5, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1,
+                        18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1}),
                         # test 2 weights
                         ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[OH;0.5][C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}",
                         [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'),
@@ -312,7 +314,7 @@ def _ele_match(n1, n2):
     if weights:
         mol_weights = {node: 1 for node in ref_graph}
         mol_weights.update(weights)
-        weights_assigned = nx.get_node_attributes(molecule, 'weight')
+        weights_assigned = nx.get_node_attributes(molecule, 'w')
         assert mol_weights == weights_assigned
 
 @pytest.mark.parametrize('case, cgsmiles_str, ref_string',(
@@ -342,13 +344,14 @@ def _atomname_match(n1, n2):
         return n1["fragname"] == n2["atomname"]
     assert nx.is_isomorphic(ref_graph, molecule, node_match=_atomname_match)
 
-@pytest.mark.parametrize('cgsmiles_str, error_message',(
-(("{[#A][#B]}.{#A=CC[$]}", "Found node #B but no corresponding fragment."),
- ("{[#A][#B]1}.{#A=CC[$],#B=OC[$]}", "You have a dangling ring index."),
- ("{[#A]1[#B]1}{#A=CC[$],#B=OC[$]}", "You define two edges between the same node. Use bond order symbols instead."),
+@pytest.mark.parametrize('cgsmiles_str, error_message, error_type',(
+(("{[#A][#B]}.{#A=CC[$]}", "Found node #B but no corresponding fragment.", SyntaxError),
+ ("{[#A][#B]1}.{#A=CC[$],#B=OC[$]}", "You have a dangling ring index.", SyntaxError),
+ ("{[#A]1[#B]1}{#A=CC[$],#B=OC[$]}", "You define two edges between the same node. Use bond order symbols instead.", SyntaxError),
+ ("{[#A;w=abc][#B]}.{#A=CC[$],#B=OC[$]}", "Argument 'w' must be of type float.", TypeError),
 )))
-def test_syntax_errors(cgsmiles_str, error_message):
-    with pytest.raises(SyntaxError) as e_message:
+def test_syntax_errors(cgsmiles_str, error_message, error_type):
+    with pytest.raises(error_type) as e_message:
         resolver = MoleculeResolver.from_string(cgsmiles_str)
         cg_mol, aa_mol = resolver.resolve()
         assert e_message == error_message
diff --git a/cgsmiles/tests/test_sampler.py b/cgsmiles/tests/test_sampler.py
index 158178f..8ccfd01 100644
--- a/cgsmiles/tests/test_sampler.py
+++ b/cgsmiles/tests/test_sampler.py
@@ -104,7 +104,7 @@ def test_add_fragment(graph_str,
     ref_graph = read_cgsmiles(ref_mol)
     nx.set_node_attributes(ref_graph, bonding, 'bonding')
     nx.set_node_attributes(ref_graph, fragid, 'fragid')
-    nx.set_node_attributes(ref_graph, 1, 'weight')
+    nx.set_node_attributes(ref_graph, 1, 'w')
     atomnames = nx.get_node_attributes(ref_graph, 'fragname')
     nx.set_node_attributes(ref_graph, atomnames, 'atomname')
     nx.set_node_attributes(ref_graph, resnames, 'fragname')

From df6c8d89fecbfc199cf2cbde5b5170a1cf46b5d0 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Fri, 8 Nov 2024 11:22:57 +0100
Subject: [PATCH 09/16] implement dialects and annotations

---
 cgsmiles/dialects.py | 78 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 cgsmiles/dialects.py

diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py
new file mode 100644
index 0000000..31ddb14
--- /dev/null
+++ b/cgsmiles/dialects.py
@@ -0,0 +1,78 @@
+from inspect import signature, Signature, Parameter
+from functools import partial
+
+def check_and_cast_types(bound_args, signature):
+    for name, value in bound_args.arguments.items():
+        param = signature.parameters.get(name)
+        # Check if a type annotation is present
+        if param and param.annotation != Parameter.empty:
+            expected_type = param.annotation
+
+        # Attempt type casting if the value is not of the expected type
+        if not isinstance(value, expected_type):
+            try:
+               bound_args.arguments[name] = expected_type(value)
+            except (TypeError, ValueError):
+                raise TypeError(f"Argument '{name}' must be of type {expected_type.__name__}")
+    return bound_args
+
+def _parse_node(string_iteratable, dialect_signature):
+    """
+    This base function parsers a CGSmiles node. It must be
+    decorated with a signature which defines the dialect.
+    The dialect sets expected labels and default values of
+    a given node.
+    """
+    args_found = []
+    kwargs_found = {}
+    if len(string_iteratable) > 0:
+        elements = string_iteratable.split(';')
+        for entry in elements:
+            key_value = entry.split('=')
+            if len(key_value) == 1:
+                args_found.append(key_value[0])
+            else:
+                kwargs_found[key_value[0]] = key_value[1]
+
+    applied_labels = dialect_signature.bind(*args_found,
+                                            **kwargs_found)
+    applied_labels = check_and_cast_types(applied_labels,
+                                          dialect_signature)
+    applied_labels.apply_defaults()
+    return applied_labels.arguments
+
+def create_dialect(default_attributes):
+    """
+    Creates a signature of default attributes.
+    Note that the order of the entries in the dict
+    determines the order of the args accapted.
+    """
+    parameters = []
+    for argname, default_value in default_attributes.items():
+        arg_type = type(default_value)
+        parameters.append(Parameter(argname,
+                                    Parameter.POSITIONAL_OR_KEYWORD,
+                                    default=default_value,
+                                    annotation=arg_type))
+    sig = Signature(parameters)
+    return sig
+
+##########################################################
+#                   KNOWN DIALECTS                       #
+##########################################################
+# this one is for global use
+# it is the base CGSmiles dialect
+GRAPH_BASE = create_dialect({"fragname": "NaN",
+                             "c": 0.0,
+                             "w": 1.0})
+parse_graph_base_node = partial(_parse_node, dialect_signature=GRAPH_BASE)
+# this one is an internal fukery until the pysmiles
+# base parser is available
+# it just strips the kwargs from fragments before
+# they go to the respective parser
+# in case of cgsmiles fragments it is a bit doing
+# double the work
+fragment_base = create_dialect({"w": 1.0})
+print(GRAPH_BASE)
+print(fragment_base)
+_fragment_node_parser = partial(_parse_node, dialect_signature=fragment_base)

From 7f43dcf63c163bb719a5f56a86019a8227721021 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Thu, 14 Nov 2024 17:38:26 +0100
Subject: [PATCH 10/16] address some comments

---
 cgsmiles/dialects.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py
index 31ddb14..c5b3820 100644
--- a/cgsmiles/dialects.py
+++ b/cgsmiles/dialects.py
@@ -16,7 +16,10 @@ def check_and_cast_types(bound_args, signature):
                 raise TypeError(f"Argument '{name}' must be of type {expected_type.__name__}")
     return bound_args
 
-def _parse_node(string_iteratable, dialect_signature):
+def _parse_node(string_iterable,
+                dialect_signature,
+                annotation_sep_token=';',
+                annotation_assign_token='='):
     """
     This base function parsers a CGSmiles node. It must be
     decorated with a signature which defines the dialect.
@@ -25,10 +28,10 @@ def _parse_node(string_iteratable, dialect_signature):
     """
     args_found = []
     kwargs_found = {}
-    if len(string_iteratable) > 0:
-        elements = string_iteratable.split(';')
+    if len(string_iterable) > 0:
+        elements = string_iterable.split(annotation_sep_token)
         for entry in elements:
-            key_value = entry.split('=')
+            key_value = entry.split(annotation_assign_token)
             if len(key_value) == 1:
                 args_found.append(key_value[0])
             else:
@@ -43,9 +46,9 @@ def _parse_node(string_iteratable, dialect_signature):
 
 def create_dialect(default_attributes):
     """
-    Creates a signature of default attributes.
+    Creates a signature of default annotations.
     Note that the order of the entries in the dict
-    determines the order of the args accapted.
+    determines the order of the args accepted.
     """
     parameters = []
     for argname, default_value in default_attributes.items():
@@ -73,6 +76,4 @@ def create_dialect(default_attributes):
 # in case of cgsmiles fragments it is a bit doing
 # double the work
 fragment_base = create_dialect({"w": 1.0})
-print(GRAPH_BASE)
-print(fragment_base)
 _fragment_node_parser = partial(_parse_node, dialect_signature=fragment_base)

From 8196158b8201948824b855b1c7d7b2118b3a9055 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Sun, 17 Nov 2024 19:16:53 +0100
Subject: [PATCH 11/16] fix bugs regarding kwargs; raise more verbose errors

---
 cgsmiles/dialects.py                   | 95 ++++++++++++++++++++------
 cgsmiles/read_fragments.py             |  7 +-
 cgsmiles/tests/test_cgsmile_parsing.py |  8 +++
 cgsmiles/tests/test_write_cgsmiles.py  |  6 ++
 4 files changed, 93 insertions(+), 23 deletions(-)

diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py
index c5b3820..1ecb6fd 100644
--- a/cgsmiles/dialects.py
+++ b/cgsmiles/dialects.py
@@ -7,44 +7,94 @@ def check_and_cast_types(bound_args, signature):
         # Check if a type annotation is present
         if param and param.annotation != Parameter.empty:
             expected_type = param.annotation
-
-        # Attempt type casting if the value is not of the expected type
-        if not isinstance(value, expected_type):
-            try:
-               bound_args.arguments[name] = expected_type(value)
-            except (TypeError, ValueError):
-                raise TypeError(f"Argument '{name}' must be of type {expected_type.__name__}")
+            # Attempt type casting if the value is not of the expected type
+            if not isinstance(value, expected_type):
+                try:
+                   bound_args.arguments[name] = expected_type(value)
+                except (TypeError, ValueError):
+                    raise TypeError(f"Argument '{name}' must be of type {expected_type.__name__}")
     return bound_args
 
-def _parse_node(string_iterable,
-                dialect_signature,
-                annotation_sep_token=';',
-                annotation_assign_token='='):
+def _parse_dialect_string(string_iterable,
+                          dialect_signature,
+                          annotation_sep_token=';',
+                          annotation_assign_token='='):
     """
-    This base function parsers a CGSmiles node. It must be
-    decorated with a signature which defines the dialect.
-    The dialect sets expected labels and default values of
-    a given node.
+    This base function parses a string that describes key value pairs
+    following the pattern:
+
+    key<annotation_assign_token>value<annotation_sep_token>key ...
+
+    Default values, non-keyword arguments and types are defined using the
+    dialect signature object. If args are defined the key and assignment
+    token may be omitted.
+
+    Neither the `annotation_sep_token` nor the `annotation_assign_token`
+    can be part of key or value. A SyntaxError is raised in this case.
+
+    Parameters
+    ----------
+    string_iterable: str
+        the string holding the annotations
+    dialect_signature: inspect.Signature
+        a signature defining args, kwargs, default values
+        and types
+    annotation_sep_token: str
+        character used to separate key value pairs
+    annotation_assign_token: str
+        character used to assign a value to a key
+
+    Returns
+    -------
+    dict
+       dict of key value pairs
+
+    Raises
+    ------
+    SyntaxError
+        if the signature does not match or an entry contains more
+        than one annotation_assign_token
     """
     args_found = []
     kwargs_found = {}
     if len(string_iterable) > 0:
         elements = string_iterable.split(annotation_sep_token)
         for entry in elements:
+            if entry.count(annotation_assign_token) > 1:
+                # more than one assignment token makes key/value ambiguous
+                msg = (f"Your annotation {entry} contains too many "
+                       f"{annotation_assign_token} charachters. Only one "
+                        "chacracter per key value pair is allowed")
+                raise SyntaxError(msg)
             key_value = entry.split(annotation_assign_token)
+
             if len(key_value) == 1:
                 args_found.append(key_value[0])
             else:
                 kwargs_found[key_value[0]] = key_value[1]
 
-    applied_labels = dialect_signature.bind(*args_found,
-                                            **kwargs_found)
+    try:
+        applied_labels = dialect_signature.bind(*args_found,
+                                                **kwargs_found)
+    except TypeError as emsg:
+        # chain the binding error instead of printing it to stdout
+        msg = ("You have too many positional arguments or "
+               f"{annotation_sep_token} as part of key value "
+                "pairs which is not allowed.")
+        raise SyntaxError(msg) from emsg
+
     applied_labels = check_and_cast_types(applied_labels,
                                           dialect_signature)
     applied_labels.apply_defaults()
-    return applied_labels.arguments
+    # if there are kwargs we need to put them into
+    # output dict
+    out_args = {}
+    out_args.update(applied_labels.arguments['kwargs'])
+    del applied_labels.arguments['kwargs']
+    out_args.update(applied_labels.arguments)
+    return out_args
 
-def create_dialect(default_attributes):
+def create_dialect(default_attributes, accept_kwargs=True):
     """
     Creates a signature of default annotations.
     Note that the order of the entries in the dict
@@ -57,6 +107,9 @@ def create_dialect(default_attributes):
                                     Parameter.POSITIONAL_OR_KEYWORD,
                                     default=default_value,
                                     annotation=arg_type))
+    if accept_kwargs:
+        parameters.append(Parameter('kwargs',
+                                    kind=Parameter.VAR_KEYWORD))
     sig = Signature(parameters)
     return sig
 
@@ -68,7 +121,7 @@ def create_dialect(default_attributes):
 GRAPH_BASE = create_dialect({"fragname": "NaN",
                              "c": 0.0,
                              "w": 1.0})
-parse_graph_base_node = partial(_parse_node, dialect_signature=GRAPH_BASE)
+parse_graph_base_node = partial(_parse_dialect_string, dialect_signature=GRAPH_BASE)
 # this one is an internal fukery until the pysmiles
 # base parser is available
 # it just strips the kwargs from fragments before
@@ -76,4 +129,4 @@ def create_dialect(default_attributes):
 # in case of cgsmiles fragments it is a bit doing
 # double the work
 fragment_base = create_dialect({"w": 1.0})
-_fragment_node_parser = partial(_parse_node, dialect_signature=fragment_base)
+_fragment_node_parser = partial(_parse_dialect_string, dialect_signature=fragment_base)
diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
index d713477..779e7a7 100644
--- a/cgsmiles/read_fragments.py
+++ b/cgsmiles/read_fragments.py
@@ -1,12 +1,16 @@
 """
 Functions for reading the fragment list.
 """
+import logging
 from collections import defaultdict
 import networkx as nx
 import pysmiles
 from .read_cgsmiles import read_cgsmiles
 from .dialects import _fragment_node_parser
 
+logger = logging.getLogger('pysmiles')
+logger.setLevel(level=logging.ERROR)
+
 class PeekIter(object):
     """
     Custom iter that allows looking ahead, without
@@ -160,13 +164,12 @@ def strip_bonding_descriptors(fragment_string):
                             chiral_token = '@' + next(smile_iter)
                         rs_isomers[node_count] = (chiral_token, [])
                     # we have weights
-                    elif peek == ';':
+                    elif peek == ';' and not record_attributes:
                         record_attributes = True
                     elif record_attributes:
                         attribute_str += peek
                     else:
                         atom += peek
-
                     peek = next(smile_iter)
 
                 record_attributes=False
diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py
index 2e2919f..d2391de 100644
--- a/cgsmiles/tests/test_cgsmile_parsing.py
+++ b/cgsmiles/tests/test_cgsmile_parsing.py
@@ -287,6 +287,14 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         None,
                         {'w': {0: 1, 1: 0.5, 2: 1}},
                         None),
+                        # smiple kwarg not part of the defaults
+                        ("[$]C[O;q=4;p=s][C;q=3;p=l][$]",
+                         "C[O][C]",
+                        {0: ["$1"], 2: ["$1"]},
+                        None,
+                        None,
+                        {'q': {1: '4', 2: '3'}, 'p': {1: 's', 2: 'l'}},
+                        None),
                         # smiple symmetric bonding with weight
                         # using cgsmiles string
                         ("[$][#TC4][#OT1;0.5][#CD1][$]",
diff --git a/cgsmiles/tests/test_write_cgsmiles.py b/cgsmiles/tests/test_write_cgsmiles.py
index 8bfae93..836c3bf 100644
--- a/cgsmiles/tests/test_write_cgsmiles.py
+++ b/cgsmiles/tests/test_write_cgsmiles.py
@@ -22,8 +22,13 @@
 ))
 def test_write_fragments(input_string):
     frag_dict = read_fragments(input_string)
+    for g in frag_dict.values():
+        print(g.nodes(data=True))
     out_string = write_cgsmiles_fragments(frag_dict, smiles_format=True)
     frag_dict_out = read_fragments(out_string)
+    for g in frag_dict_out.values():
+        print(g.nodes(data=True))
+    print(out_string)
     assert set(frag_dict_out) == set(frag_dict)
     for fragname in frag_dict:
         assertEqualGraphs(frag_dict_out[fragname], frag_dict[fragname])
@@ -58,6 +63,7 @@ def test_write_cgsmiles(input_string):
     fragment_dicts = resolver.fragment_dicts
     molecule = resolver.molecule
     output_string = write_cgsmiles(molecule, fragment_dicts)
+    print(output_string)
     out_resolver =  MoleculeResolver.from_string(output_string)
     out_mol = out_resolver.molecule
     assertEqualGraphs(molecule, out_mol)

From 351c15c0183f7cb177548af94f903eb63dd00196 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Sun, 17 Nov 2024 19:21:20 +0100
Subject: [PATCH 12/16] test more verbose errors

---
 cgsmiles/tests/test_molecule_resolve.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py
index f4569ba..4e62d6f 100644
--- a/cgsmiles/tests/test_molecule_resolve.py
+++ b/cgsmiles/tests/test_molecule_resolve.py
@@ -355,6 +355,8 @@ def _atomname_match(n1, n2):
  ("{[#A][#B]1}.{#A=CC[$],#B=OC[$]}", "You have a dangling ring index.", SyntaxError),
  ("{[#A]1[#B]1}{#A=CC[$],#B=OC[$]}", "You define two edges between the same node. Use bond order symbols instead.", SyntaxError),
  ("{[#A;w=abc][#B]}.{#A=CC[$],#B=OC[$]}", "Argument 'w' must be of type float.", TypeError),
+ ("{[#A;w=ab=c][#B]}.{#A=CC[$],#B=OC[$]}", "Your annotation w=ab=c contains too many = charachters. Only one chacracter per key value pair is allowed", SyntaxError),
+ ("{[#A;w=1,c=1,q=a;d][#B]}.{#A=CC[$],#B=OC[$]}", "You have too many positional arguments or ; as part of key value pairs which is not allowed.", SyntaxError),
 )))
 def test_syntax_errors(cgsmiles_str, error_message, error_type):
     with pytest.raises(error_type) as e_message:

From 73fc426c7e776b608387e4f079eaee819f069a6a Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Sun, 17 Nov 2024 19:28:28 +0100
Subject: [PATCH 13/16] rename base dialect to default dialect

---
 cgsmiles/dialects.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py
index 1ecb6fd..91c8c71 100644
--- a/cgsmiles/dialects.py
+++ b/cgsmiles/dialects.py
@@ -118,10 +118,10 @@ def create_dialect(default_attributes, accept_kwargs=True):
 ##########################################################
 # this one is for global use
 # it is the base CGSmiles dialect
-GRAPH_BASE = create_dialect({"fragname": "NaN",
-                             "c": 0.0,
-                             "w": 1.0})
-parse_graph_base_node = partial(_parse_dialect_string, dialect_signature=GRAPH_BASE)
+CGSMILES_DEFAULT_DIALECT = create_dialect({"fragname": "NaN",
+                                           "c": 0.0,
+                                           "w": 1.0})
+parse_graph_base_node = partial(_parse_dialect_string, dialect_signature=CGSMILES_DEFAULT_DIALECT)
 # this one is an internal fukery until the pysmiles
 # base parser is available
 # it just strips the kwargs from fragments before

From 91f77aa943a2123d1b6c13884a2745d06fc08362 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Mon, 18 Nov 2024 12:03:33 +0100
Subject: [PATCH 14/16] enable explicit hatoms

---
 cgsmiles/cgsmiles_utils.py              |  33 ++++++
 cgsmiles/dialects.py                    |   7 +-
 cgsmiles/graph_utils.py                 |   1 -
 cgsmiles/pysmiles_utils.py              | 137 +++++++++++++++++++-----
 cgsmiles/read_fragments.py              |  57 +++-------
 cgsmiles/tests/test_cgsmile_parsing.py  |  55 +++-------
 cgsmiles/tests/test_molecule_resolve.py |  19 ++--
 7 files changed, 189 insertions(+), 120 deletions(-)

diff --git a/cgsmiles/cgsmiles_utils.py b/cgsmiles/cgsmiles_utils.py
index ef723da..6863fe0 100644
--- a/cgsmiles/cgsmiles_utils.py
+++ b/cgsmiles/cgsmiles_utils.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 import networkx as nx
+from .read_cgsmiles import read_cgsmiles
 
 def find_complementary_bonding_descriptor(bonding_descriptor, ellegible_descriptors=None):
     """
@@ -64,3 +65,35 @@ def find_open_bonds(molecule, target_nodes=None):
             for bonding_types in bonding_types:
                 open_bonds_by_descriptor[bonding_types].append(node)
     return open_bonds_by_descriptor
+
+def read_fragment_cgsmiles(cgsmiles_str,
+                           fragname,
+                           bonding_descrpt={},
+                           attributes={}):
+    """
+    Read a cgsmiles_str corresponding to a CGSmiles fragment and
+    annotate bonding descriptors as well as any other
+    attributes.
+
+    Parameters
+    ----------
+    cgsmiles_str: str
+        string in CGSmiles format
+    fragname: str
+        the name of the fragment
+    attributes: dict
+
+    Returns
+    -------
+    nx.Graph
+        the graph of the molecular fragment
+    """
+    mol_graph = read_cgsmiles(cgsmiles_str)
+    fragnames = nx.get_node_attributes(mol_graph, 'fragname')
+    nx.set_node_attributes(mol_graph, fragnames, 'atomname')
+    nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
+    nx.set_node_attributes(mol_graph, fragname, 'fragname')
+    nx.set_node_attributes(mol_graph, 0, 'fragid')
+    nx.set_node_attributes(mol_graph, 1, 'w')
+    nx.set_node_attributes(mol_graph, attributes)
+    return mol_graph
diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py
index 91c8c71..36b3a4b 100644
--- a/cgsmiles/dialects.py
+++ b/cgsmiles/dialects.py
@@ -89,8 +89,9 @@ def _parse_dialect_string(string_iterable,
     # if there are kwargs we need to put them into
     # output dict
     out_args = {}
-    out_args.update(applied_labels.arguments['kwargs'])
-    del applied_labels.arguments['kwargs']
+    if 'kwargs' in applied_labels.arguments:
+        out_args.update(applied_labels.arguments['kwargs'])
+        del applied_labels.arguments['kwargs']
     out_args.update(applied_labels.arguments)
     return out_args
 
@@ -128,5 +129,5 @@ def create_dialect(default_attributes, accept_kwargs=True):
 # they go to the respective parser
 # in case of cgsmiles fragments it is a bit doing
 # double the work
-fragment_base = create_dialect({"w": 1.0})
+fragment_base = create_dialect({"w": 1.0}, accept_kwargs=True)
 _fragment_node_parser = partial(_parse_dialect_string, dialect_signature=fragment_base)
diff --git a/cgsmiles/graph_utils.py b/cgsmiles/graph_utils.py
index 64c384f..6a39ea2 100644
--- a/cgsmiles/graph_utils.py
+++ b/cgsmiles/graph_utils.py
@@ -146,7 +146,6 @@ def annotate_fragments(meta_graph, molecule):
 
     return meta_graph
 
-
 def set_atom_names_atomistic(molecule, meta_graph=None):
     """
     Set atomnames according to commonly used convention
diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
index 9b66e6d..61ace56 100644
--- a/cgsmiles/pysmiles_utils.py
+++ b/cgsmiles/pysmiles_utils.py
@@ -32,7 +32,7 @@ def compute_mass(input_molecule):
         mass += pysmiles.PTE[element]['AtomicMass']
     return mass
 
-def rebuild_h_atoms(mol_graph, keep_bonding=False):
+def rebuild_h_atoms(mol_graph, copy_attrs=['fragid', 'fragname', 'w']):
     """
     Helper function which add hydrogen atoms to the molecule graph.
 
@@ -48,21 +48,20 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
     The molecule graph is updated in place with the hydrogen atoms
     that are missing.
 
-    Using the keep_bonding argument the hydrogen count is reduced
-    by the number of bonding descriptors. In this way hydrogen
-    atoms can also be added to fragments only.
+    The `copy_attrs` argument defines a list of attributes to copy
+    to the newly added hydrogen atoms. In case the hydrogen atoms
+    are their own fragments, attributes are not copied. If an attribute
+    is already assigned because the hydrogen atom was explicit, that
+    attribute is not replaced.
 
     Parameters
     ----------
     mol_graph: :class:`nx.Graph`
         graph describing the full molecule without hydrogen atoms
+    copy_attrs: list[abc.Hashable]
+        a list of attributes to copy from the parent node to the
+        hydrogen atom
     """
-    for node in mol_graph.nodes:
-
-        if mol_graph.nodes[node].get('bonding', False) and  \
-            mol_graph.nodes[node].get('element', '*') == "H":
-            mol_graph.nodes[node]['single_h_frag'] = True
-
     try:
         pysmiles.smiles_helper.correct_aromatic_rings(mol_graph, strict=True)
     except SyntaxError as pysmiles_err:
@@ -79,22 +78,14 @@ def rebuild_h_atoms(mol_graph, keep_bonding=False):
     pysmiles.smiles_helper.fill_valence(mol_graph, respect_hcount=False)
     pysmiles.smiles_helper.add_explicit_hydrogens(mol_graph)
 
-    for node in mol_graph.nodes:
-        if mol_graph.nodes[node].get("element", "*") == "H" and\
-        not mol_graph.nodes[node].get("single_h_frag", False):
-            ref_node = next(mol_graph.neighbors(node))
-            mol_graph.nodes[node]["fragid"] = mol_graph.nodes[ref_node]["fragid"]
-            mol_graph.nodes[node]["fragname"] = mol_graph.nodes[ref_node]["fragname"]
-        if mol_graph.nodes[node].get("element", "*") == "H":
-            anchor = list(mol_graph.neighbors(node))[0]
-            # the weight for the hydrogen atom was explicitly set
-            hweights = mol_graph.nodes[anchor].get('hweight', [])
-            if hweights:
-                weight = hweights.pop()
-            # make sure the weights are copied for implicit h-atoms
-            else:
-                weight = mol_graph.nodes[anchor].get("w", 1)
-            mol_graph.nodes[node]["w"] = weight
+    for node, element in mol_graph.nodes(data='element'):
+        if element == "H" and not mol_graph.nodes[node].get("single_h_frag", False):
+            anchor = next(mol_graph.neighbors(node))
+            for attr in copy_attrs:
+                if attr in mol_graph.nodes[node]:
+                    continue
+                value = mol_graph.nodes[anchor][attr]
+                mol_graph.nodes[node][attr] = value
 
 def annotate_ez_isomers(molecule):
     """
@@ -177,3 +168,97 @@ def mark_chiral_atoms(molecule):
             neighbours = [neighbours[0],  neighbours[1], neighbours[3], neighbours[2]]
 
         molecule.nodes[node]['rs_isomer'] = tuple(neighbours)
+
+def read_fragment_smiles(smiles_str,
+                         fragname,
+                         bonding_descrpt={},
+                         rs_isomers={},
+                         ez_isomers={},
+                         attributes={}):
+    """
+    Read a smiles_str corresponding to a CGSmiles fragment and
+    annotate bonding descriptors, isomers, as well as any other
+    attributes.
+
+    This function also sets default attributes as follows:
+
+    - fragname to `fragname`
+    - fragid to 0
+    - w to 1
+
+    Parameters
+    ----------
+    smiles_str: str
+        string in OpenSMILES format
+    fragname: str
+        the name of the fragment
+    rs_isomers: dict
+    ez_isomers: dict
+    attributes: dict
+
+    Returns
+    -------
+    nx.Graph
+        the graph of the molecular fragment
+    """
+    if smiles_str == 'H':
+        LOGGER.warning("You define an H fragment, which is not valid SMILES. We'll make it [H].")
+        smiles_str = '[H]'
+
+    mol_graph = pysmiles.read_smiles(smiles_str,
+                                     explicit_hydrogen=True,
+                                     reinterpret_aromatic=False,
+                                     strict=False)
+    # set some default values
+    nx.set_node_attributes(mol_graph, fragname, 'fragname')
+    nx.set_node_attributes(mol_graph, 0, 'fragid')
+    nx.set_node_attributes(mol_graph, 1, 'w')
+
+    # we add all bonding descriptors to the molecule
+    nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
+
+    # set other attributes
+    nx.set_node_attributes(mol_graph, attributes)
+
+    # set the default atomnames consisting of the element and index
+    atomnames = {node[0]: node[1]['element']+str(node[0]) for node in mol_graph.nodes(data=True)}
+    nx.set_node_attributes(mol_graph, atomnames, 'atomname')
+
+    # we have just a single atom so no need for any annotations
+    if len(mol_graph) == 1:
+        # we set the hcount for all non-hydrogen elements
+        if mol_graph.nodes[0]['element'] != 'H':
+            mol_graph.nodes[0]['hcount'] = 0
+        # we tag all single h-atoms
+        else:
+            mol_graph.nodes[0]['single_h_frag'] = True
+        return mol_graph
+
+    # we need to remove hydrogen atoms except when they have
+    # attributes; in this case we need to keep them
+    hatoms = set([n for n, e in mol_graph.nodes(data='element') if e == 'H'])
+    hatoms_to_keep = set(attributes.keys()) & hatoms
+
+    # temp fix until pysmiles util is improved
+    # we set the element to z so they are ignored when pysmiles removes hatoms
+    nx.set_node_attributes(mol_graph,
+                           dict(zip(hatoms_to_keep, len(hatoms_to_keep)*'z')),
+                           'element')
+
+    pysmiles.remove_explicit_hydrogens(mol_graph)
+
+    # now we reset the hatoms
+    nx.set_node_attributes(mol_graph,
+                           dict(zip(hatoms_to_keep, len(hatoms_to_keep)*'H')),
+                           'element')
+
+    # annotate rs isomers
+    nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer')
+
+    # we need to split countable node keys and the associated value
+    ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()}
+    ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()}
+    nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms')
+    nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class')
+
+    return mol_graph
diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py
index 779e7a7..16e782a 100644
--- a/cgsmiles/read_fragments.py
+++ b/cgsmiles/read_fragments.py
@@ -7,6 +7,8 @@
 import pysmiles
 from .read_cgsmiles import read_cgsmiles
 from .dialects import _fragment_node_parser
+from .pysmiles_utils import read_fragment_smiles
+from .cgsmiles_utils import read_fragment_cgsmiles
 
 logger = logging.getLogger('pysmiles')
 logger.setLevel(level=logging.ERROR)
@@ -129,7 +131,6 @@ def strip_bonding_descriptors(fragment_string):
     rs_isomers = {}
     attributes = defaultdict(dict)
     record_attributes = False
-    hydrogen_weights = defaultdict(list)
     smile = ""
     node_count = 0
     prev_node = 0
@@ -176,12 +177,6 @@ def strip_bonding_descriptors(fragment_string):
                 # here we do some post processing cleanup
                 node_attributes = _fragment_node_parser(attribute_str)
                 attributes[node_count].update(node_attributes)
-                # hydrogen atoms are implicit so we filter
-                # them out here
-                if atom[1:] == 'H' and node_count == 0:
-                    hydrogen_weights[1].append(attributes[node_count]['w'])
-                elif atom[1:] == 'H':
-                    hydrogen_weights[prev_node].append(attributes[node_count]['w'])
 
                 smile = smile + atom + "]"
                 prev_node = node_count
@@ -219,8 +214,6 @@ def strip_bonding_descriptors(fragment_string):
                 smile += token
             current_order = None
             prev_node = node_count
-            # set default weight
-            attributes[node_count]['w'] = 1
             node_count += 1
 
     # we need to annotate rings to the chiral isomers
@@ -230,7 +223,7 @@ def strip_bonding_descriptors(fragment_string):
                 bonded_node = _find_bonded_ring_node(ring_nodes, node)
                 rs_isomers[node][1].append(bonded_node)
 
-    return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, attributes, hydrogen_weights
+    return smile, bonding_descrpt, rs_isomers, ez_isomer_atoms, attributes
 
 def fragment_iter(fragment_str, all_atom=True):
     """
@@ -260,39 +253,21 @@ def fragment_iter(fragment_str, all_atom=True):
         delim = fragment.find('=', 0)
         fragname = fragment[1:delim]
         frag_smile = fragment[delim+1:]
-        smile, bonding_descrpt, rs_isomers, ez_isomers, attributes, h_weights = strip_bonding_descriptors(frag_smile)
-        if smile == "H":
-            mol_graph = nx.Graph()
-            mol_graph.add_node(0, element="H", bonding=bonding_descrpt[0])
-            nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
-        elif all_atom:
-            mol_graph = pysmiles.read_smiles(smile,
-                                             reinterpret_aromatic=False,
-                                             strict=False)
-            nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
-            nx.set_node_attributes(mol_graph, rs_isomers, 'rs_isomer')
-            # we need to split countable node keys and the associated value
-            ez_isomer_atoms = {idx: val[:-1] for idx, val in ez_isomers.items()}
-            ez_isomer_class = {idx: val[-1] for idx, val in ez_isomers.items()}
-            nx.set_node_attributes(mol_graph, ez_isomer_atoms, 'ez_isomer_atoms')
-            nx.set_node_attributes(mol_graph, ez_isomer_class, 'ez_isomer_class')
-            # set the hydrogen weight attribute
-            nx.set_node_attributes(mol_graph, h_weights, 'hweight')
+        smiles_str, bonding_descrpt, rs_isomers, ez_isomers, attributes = strip_bonding_descriptors(frag_smile)
+        # read an all_atom fragment using OpenSMILES definition
+        if all_atom:
+            mol_graph = read_fragment_smiles(smiles_str,
+                                             fragname,
+                                             bonding_descrpt,
+                                             rs_isomers,
+                                             ez_isomers,
+                                             attributes)
         # we deal with a CG resolution graph
         else:
-            mol_graph = read_cgsmiles(smile)
-            fragnames = nx.get_node_attributes(mol_graph, 'fragname')
-            nx.set_node_attributes(mol_graph, fragnames, 'atomname')
-            nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
-
-        if all_atom:
-            atomnames = {node[0]: node[1]['element']+str(node[0]) for node in mol_graph.nodes(data=True)}
-            nx.set_node_attributes(mol_graph, atomnames, 'atomname')
-
-        nx.set_node_attributes(mol_graph, fragname, 'fragname')
-        nx.set_node_attributes(mol_graph, 0, 'fragid')
-        # set other attributes
-        nx.set_node_attributes(mol_graph, attributes)
+            mol_graph = read_fragment_cgsmiles(smiles_str,
+                                               fragname,
+                                               bonding_descrpt,
+                                               attributes)
         yield fragname, mol_graph
 
 def read_fragments(fragment_str, all_atom=True, fragment_dict=None):
diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py
index d2391de..c62f3f7 100644
--- a/cgsmiles/tests/test_cgsmile_parsing.py
+++ b/cgsmiles/tests/test_cgsmile_parsing.py
@@ -270,14 +270,13 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
         set_charges = nx.get_node_attributes(meta_mol, 'c')
         assert set_charges == charges
 
-@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, attrs, hweights',(
+@pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, attrs',(
   # smiple symmetric bonding
                         ("[$]COC[$]",
                          "COC",
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        None,
                         None),
                         # smiple symmetric bonding with weight
                         ("[$]C[O;0.5]C[$]",
@@ -285,16 +284,14 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {'w': {0: 1, 1: 0.5, 2: 1}},
-                        None),
+                        {'w': {1: 0.5}}),
                         # smiple kwarg not part of the defaults
                         ("[$]C[O;q=4;p=s][C;q=3;p=l][$]",
                          "C[O][C]",
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {'q': {1: '4', 2: '3'}, 'p': {1: 's', 2: 'l'}},
-                        None),
+                        {'q': {1: '4', 2: '3'}, 'p': {1: 's', 2: 'l'}}),
                         # smiple symmetric bonding with weight
                         # using cgsmiles string
                         ("[$][#TC4][#OT1;0.5][#CD1][$]",
@@ -302,17 +299,15 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {'w': {0: 1, 1: 0.5, 2: 1}},
-                        None),
-                      # # smiple symmetric bonding with random
-                      # # keyword argument
-                      # ("[$][#TC4][#OT1;r=abc][#CD1][$]",
-                      #  "[#TC4][#OT1][#CD1]",
-                      # {0: ["$1"], 2: ["$1"]},
-                      # None,
-                      # None,
-                      # {'w': {0: 1, 1: 1, 2: 1}, 'r': {1: 'abc'}},
-                      # None),
+                        {'w': {1: 0.5}}),
+                        # smiple symmetric bonding with random
+                        # keyword argument
+                        ("[$][#TC4][#OT1;r=abc][#CD1][$]",
+                         "[#TC4][#OT1][#CD1]",
+                        {0: ["$1"], 2: ["$1"]},
+                        None,
+                        None,
+                        {'r': {1: 'abc'}}),
                         # smiple symmetric bonding with weight
                         # using open smiles and hweights
                         ("[$]CO[C;0.5][$]([H;0.1])[H;0.2]",
@@ -320,23 +315,20 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {'w':{0: 1, 1: 1, 2: 0.5}},
-                        {2: [0.1, 0.2]}),
+                        {'w': {2: 0.5, 3: 0.1, 4: 0.2}}),
                         # H atom with weight goes first
                         ("[H;0.3]C[$]O[C;0.5][$]",
                          "[H]CO[C]",
                         {1: ["$1"], 3: ["$1"]},
                         None,
                         None,
-                        {'w': {1: 1, 2: 1, 3: 0.5}},
-                        {1: [0.3]}),
+                        {'w': {0: 0.3, 3: 0.5}}),
                         # smiple symmetric bonding with more than one name
                         ("[$1A]COC[$1A]",
                          "COC",
                         {0: ["$1A1"], 2: ["$1A1"]},
                         None,
                         None,
-                        None,
                         None),
                         # smiple bonding multiletter atom
                         ("Clc[$]c[$]",
@@ -344,7 +336,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {1: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        None,
                         None),
                         # simple symmetric but with explicit hydrogen
                         ("[$][CH2]O[CH2][$]",
@@ -352,7 +343,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        None,
                         None),
                         # smiple symmetric bonding; multiple descript
                         ("[$]COC[$][$1]",
@@ -360,7 +350,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1", "$11"]},
                         None,
                         None,
-                        None,
                         None),
                         # named different bonding descriptors
                         ("[$1]CCCC[$2]",
@@ -368,7 +357,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$11"], 3: ["$21"]},
                         None,
                         None,
-                        None,
                         None),
                         # ring and bonding descriptors
                         ("[$1]CC[$2]C1CCCCC1",
@@ -376,7 +364,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$11"], 1: ["$21"]},
                         None,
                         None,
-                        None,
                         None),
                         # bonding descript. after branch
                         ("C(COC[$1])[$2]CCC[$3]",
@@ -384,7 +371,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$21"], 3: ["$11"], 6: ["$31"]},
                         None,
                         None,
-                        None,
                         None),
                         # left rigth bonding desciptors
                         ("[>]COC[<]",
@@ -392,7 +378,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 2: ["<1"]},
                         None,
                         None,
-                        None,
                         None),
                         # simple chirality in residue
                         ("[>]C[C@](F)(B)N[<]",
@@ -400,7 +385,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 4: ["<1"]},
                         {1: ('@', [])},
                         None,
-                        None,
                         None),
                         # simple chirality inverse in residue
                         ("[>]C[C@@](F)(B)N[<]",
@@ -408,7 +392,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 4: ["<1"]},
                         {1: ('@@', [])},
                         None,
-                        None,
                         None),
                         # \ fragment split
                         ("[>]CC(\F)=[<]",
@@ -416,7 +399,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 1: ["<2"]},
                         None,
                         {2: (2, 1, '\\')},
-                        None,
                         None),
                         # / fragment split
                         ("[>]CC(/F)=[<]",
@@ -424,7 +406,6 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 1: ["<2"]},
                         None,
                         {2: (2, 1, '/')},
-                        None,
                         None),
                         # both in one fragment
                         ("[>]CC(/F)=C(\F)C[<]",
@@ -432,11 +413,10 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: [">1"], 5: ["<1"]},
                         None,
                         {2: (2, 1, '/'), 4: (4, 3, '\\')},
-                        None,
                         None),
 ))
-def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs, hweights):
-    new_smile, new_bonding, rs_isomers, ez_isomers, attrs_out, hweights_out = strip_bonding_descriptors(big_smile)
+def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs):
+    new_smile, new_bonding, rs_isomers, ez_isomers, attrs_out = strip_bonding_descriptors(big_smile)
     assert new_smile == smile
     assert new_bonding == bonding
     if rs:
@@ -451,9 +431,6 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs, hwe
         for attr, node_attrs in attrs.items():
             for node, value in node_attrs.items():
                 assert attrs_out[node][attr] == value
-    if hweights:
-        for node, weight in hweights.items():
-            assert hweights_out[node] == weight
 
 @pytest.mark.parametrize('fragment_str, nodes, edges',(
                         # single fragment
diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py
index 4e62d6f..db7b1cc 100644
--- a/cgsmiles/tests/test_molecule_resolve.py
+++ b/cgsmiles/tests/test_molecule_resolve.py
@@ -42,7 +42,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
 
 @pytest.mark.parametrize('smile, ref_frags, elements, ref_edges, chiral, ez, weights',(
                         # smiple linear seqeunce
-                        ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$][O]}",
+                        ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]COC[$],#OHter=[$]O}",
                         #           0 1             2 3 4 5 6 7 8
                         [('OHter', 'O H'), ('PEO', 'C O C H H H H'),
                         #        9 10 11 12 13 14 15         16 17
@@ -63,7 +63,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                          (6, 8), (7, 9), (7, 11), (10, 11), (10, 12), (10, 13),
                          (10, 14), (11, 15)], {}, {}, {}),
                         # smiple linear seqeunce unconsumed bonding descrpt
-                        ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$][O]}",
+                        ("{[#OHter][#PEO]|2[#OHter]}.{#PEO=[$]CO[>]C[$],#OHter=[$]O}",
                         #           0 1             2 3 4 5 6 7 8
                         [('OHter', 'O H'), ('PEO', 'C O C H H H H'),
                         #        9 10 11 12 13 14 15         16 17
@@ -115,7 +115,7 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         # something with a ring
                         #            012 34567
                         #            890123456
-                        ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$]H}",
+                        ("{[#Hter][#PS]|2[#Hter]}.{#PS=[$]CC[$]c1ccccc1,#Hter=[$][H]}",
                         [('Hter', 'H'), ('PS', 'C C C C C C C C H H H H H H H H'),
                          ('PS', 'C C C C C C C C H H H H H H H H'), ('Hter', 'H')],
                         'H C C C C C C C C H H H H H H H H C C C C C C C C H H H H H H H H H',
@@ -273,14 +273,13 @@ def test_match_bonding_descriptors(bonds_source, bonds_target, edge, btypes):
                         {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.5, 8: 0.5, 9: 0.1, 12: 0.5, 13: 0.1}),
                         # test 2 weights and hydrogen weights
                         ("{[#SP4]1[#SP4][#SP1r]1}.{#SP4=[O;0.5]([H;0.2])[C;0.1][$]C[$]O,#SP1r=[$]OC[$]CO}",
-                        [('SP4', 'O C C O H H H H'), ('SP4', 'O C C O H H H H'),
+                        [('SP4', 'O H C C O H H H'), ('SP4', 'O H C C O H H H'),
                          ('SP1r', 'O C C O H H H H')],
-                        'O C C O H H H H O C C O H H H H O C C O H H H H',
-                        [(0, 1), (0, 4), (1, 2), (1, 9), (1, 5), (2, 3), (2, 16), (2, 6),
-                         (3, 7), (8, 9), (8, 12), (9, 10), (9, 13), (10, 11), (10, 17),
-                         (10, 14), (11, 15), (16, 17), (17, 18), (17, 20), (18, 19),
-                         (18, 21), (18, 22), (19, 23)],
-                        {},{}, {0: 0.5, 1: 0.1, 5: 0.1, 4: 0.2, 8: 0.5, 9: 0.1, 12: 0.2, 13: 0.1}),
+                        'O H C C O H H H O H C C O H H H O C C O H H H H',
+                        [(0, 1), (0, 2), (2, 3), (2, 10), (2, 5), (3, 4), (3, 16), (3, 6), (4, 7), (8, 9),
+                         (8, 10), (10, 11), (10, 13), (11, 12), (11, 17), (11, 14), (12, 15), (16, 17),
+                         (17, 18), (17, 20), (18, 19), (18, 21), (18, 22), (19, 23)],
+                        {},{}, {0: 0.5, 1: 0.2, 2: 0.1, 5: 0.1, 8: 0.5, 9: 0.2, 10: 0.1, 13: 0.1}),
 ))
 def test_all_atom_resolve_molecule(smile, ref_frags, elements, ref_edges, chiral, ez, weights):
     meta_mol, molecule = MoleculeResolver.from_string(smile).resolve()

From 4e508f9f4ddebf60f88e000b03a1f42fd829805b Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Mon, 18 Nov 2024 12:19:16 +0100
Subject: [PATCH 15/16] change c to q and use fullnames for weight and charge

---
 cgsmiles/dialects.py                    | 25 +++++++++++--
 cgsmiles/pysmiles_utils.py              |  4 +-
 cgsmiles/tests/test_cgsmile_parsing.py  | 50 ++++++++++++-------------
 cgsmiles/tests/test_molecule_resolve.py |  2 +-
 4 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py
index 36b3a4b..81b3687 100644
--- a/cgsmiles/dialects.py
+++ b/cgsmiles/dialects.py
@@ -17,6 +17,7 @@ def check_and_cast_types(bound_args, signature):
 
 def _parse_dialect_string(string_iterable,
                           dialect_signature,
+                          arg_to_fullname={},
                           annotation_sep_token=';',
                           annotation_assign_token='='):
     """
@@ -39,6 +40,8 @@ def _parse_dialect_string(string_iterable,
     dialect_signature: cls.inspec.Signature
         a signature defineing args, kwargs, default values
         and types
+    arg_to_fullname: dict
+        maps arguments to more verbose descriptions
     annotation_sep_token: str
         character used to seperate key value pairs
     annotation_assign_token: str
@@ -86,6 +89,18 @@ def _parse_dialect_string(string_iterable,
     applied_labels = check_and_cast_types(applied_labels,
                                           dialect_signature)
     applied_labels.apply_defaults()
+    # convert keys to more verbose names
+    # this should only apply to args known to
+    # the signature
+    remove_keys = []
+    for old_key, new_key in arg_to_fullname.items():
+        if old_key in applied_labels.arguments:
+            applied_labels.arguments[new_key] = applied_labels.arguments[old_key]
+            remove_keys.append(old_key)
+
+    for key in remove_keys:
+        del applied_labels.arguments[key]
+
     # if there are kwargs we need to put them into
     # output dict
     out_args = {}
@@ -120,9 +135,11 @@ def create_dialect(default_attributes, accept_kwargs=True):
 # this one is for global use
 # it is the base CGSmiles dialect
 CGSMILES_DEFAULT_DIALECT = create_dialect({"fragname": "NaN",
-                                           "c": 0.0,
+                                           "q": 0.0,
                                            "w": 1.0})
-parse_graph_base_node = partial(_parse_dialect_string, dialect_signature=CGSMILES_DEFAULT_DIALECT)
+parse_graph_base_node = partial(_parse_dialect_string,
+                                dialect_signature=CGSMILES_DEFAULT_DIALECT,
+                                arg_to_fullname = {"w": "weight", "q": "charge"})
 # this one is an internal fukery until the pysmiles
 # base parser is available
 # it just strips the kwargs from fragments before
@@ -130,4 +147,6 @@ def create_dialect(default_attributes, accept_kwargs=True):
 # in case of cgsmiles fragments it is a bit doing
 # double the work
 fragment_base = create_dialect({"w": 1.0}, accept_kwargs=True)
-_fragment_node_parser = partial(_parse_dialect_string, dialect_signature=fragment_base)
+_fragment_node_parser = partial(_parse_dialect_string,
+                                dialect_signature=fragment_base,
+                                arg_to_fullname = {"w": "weight"})
diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py
index 61ace56..69f76ab 100644
--- a/cgsmiles/pysmiles_utils.py
+++ b/cgsmiles/pysmiles_utils.py
@@ -32,7 +32,7 @@ def compute_mass(input_molecule):
         mass += pysmiles.PTE[element]['AtomicMass']
     return mass
 
-def rebuild_h_atoms(mol_graph, copy_attrs=['fragid', 'fragname', 'w']):
+def rebuild_h_atoms(mol_graph, copy_attrs=['fragid', 'fragname', 'weight']):
     """
     Helper function which add hydrogen atoms to the molecule graph.
 
@@ -212,7 +212,7 @@ def read_fragment_smiles(smiles_str,
     # set some default values
     nx.set_node_attributes(mol_graph, fragname, 'fragname')
     nx.set_node_attributes(mol_graph, 0, 'fragid')
-    nx.set_node_attributes(mol_graph, 1, 'w')
+    nx.set_node_attributes(mol_graph, 1, 'weight')
 
     # we add all bonding descriptors to the molecule
     nx.set_node_attributes(mol_graph, bonding_descrpt, 'bonding')
diff --git a/cgsmiles/tests/test_cgsmile_parsing.py b/cgsmiles/tests/test_cgsmile_parsing.py
index c62f3f7..ced9802 100644
--- a/cgsmiles/tests/test_cgsmile_parsing.py
+++ b/cgsmiles/tests/test_cgsmile_parsing.py
@@ -17,7 +17,7 @@
                         [(0, 1), (1, 2)],
                         [1, 1]),
                         # smiple charges with keyword
-                        ("{[#PMA;c=+1][#PEO][#PMA;c=-0.25]}",
+                        ("{[#PMA;q=+1][#PEO][#PMA;q=-0.25]}",
                         ["PMA", "PEO", "PMA"],
                         {0: 1.0, 1: 0.0, 2:-0.25},
                         [(0, 1), (1, 2)],
@@ -267,7 +267,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
     assert nodes == list(fragnames.values())
 
     if charges:
-        set_charges = nx.get_node_attributes(meta_mol, 'c')
+        set_charges = nx.get_node_attributes(meta_mol, 'charge')
         assert set_charges == charges
 
 @pytest.mark.parametrize('big_smile, smile, bonding, rs, ez, attrs',(
@@ -284,7 +284,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {'w': {1: 0.5}}),
+                        {'weight': {1: 0.5}}),
                         # smiple kwarg not part of the defaults
                         ("[$]C[O;q=4;p=s][C;q=3;p=l][$]",
                          "C[O][C]",
@@ -299,7 +299,7 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {'w': {1: 0.5}}),
+                        {'weight': {1: 0.5}}),
                         # smiple symmetric bonding with random
                         # keyword argument
                         ("[$][#TC4][#OT1;r=abc][#CD1][$]",
@@ -315,14 +315,14 @@ def test_read_cgsmiles(smile, nodes, charges, edges, orders):
                         {0: ["$1"], 2: ["$1"]},
                         None,
                         None,
-                        {'w': {2: 0.5, 3: 0.1, 4: 0.2}}),
+                        {'weight': {2: 0.5, 3: 0.1, 4: 0.2}}),
                         # H atom with weight goes first
                         ("[H;0.3]C[$]O[C;0.5][$]",
                          "[H]CO[C]",
                         {1: ["$1"], 3: ["$1"]},
                         None,
                         None,
-                        {'w': {0: 0.3, 3: 0.5}}),
+                        {'weight': {0: 0.3, 3: 0.5}}),
                         # smiple symmetric bonding with more than one name
                         ("[$1A]COC[$1A]",
                          "COC",
@@ -435,50 +435,50 @@ def test_strip_bonding_descriptors(big_smile, smile, bonding, rs, ez, attrs):
 @pytest.mark.parametrize('fragment_str, nodes, edges',(
                         # single fragment
                         ("{#PEO=[$]COC[$]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
                                 )},
                         {"PEO": [(0, 1), (1, 2)]}),
                         # single fragment but with explicit hydrogen in smiles
                         ("{#PEO=[$][CH2]O[CH2][$]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}),
                                 )},
                         {"PEO": [(0, 1), (1, 2)]}),
                         # test NH3 terminal
                         ("{#AMM=N[$]}",
-                        {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "w": 1}),
+                        {"AMM": ((0, {"atomname": "N0", "fragname": "AMM", "bonding": ["$1"], "element": "N", "hcount": 3, "weight": 1}),
                                 )},
                         {"AMM": []}),
                         # single fragment + 1 terminal (i.e. only 1 bonding descrpt
                         ("{#PEO=[$]COC[$],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "w": 1}),)},
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "weight": 1}),)},
                         {"PEO": [(0, 1), (1, 2)],
                          "OHter": []}),
                         # single fragment + 1 terminal but multiple bond descritp.
                         # this adjust the hydrogen count
                         ("{#PEO=[$]COC[$][$1],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "w": 1}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "w": 1}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 3, "weight": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 3, "weight": 1}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "w": 1}),)},
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}),)},
                         {"PEO": [(0, 1), (1, 2)],
                          "OHter": []}),
                         # single fragment + 1 terminal but multiple bond descritp.
                         # but explicit hydrogen in the smiles string
                         ("{#PEO=[$][CH2]O[CH2][$][$1],#OHter=[$][OH]}",
-                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "w": 1}),
-                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "w": 1}),
-                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "w": 1}),
+                        {"PEO": ((0, {"atomname": "C0", "fragname": "PEO", "bonding": ["$1"], "element": "C", "hcount": 2, "weight": 1}),
+                                 (1, {"atomname": "O1", "fragname": "PEO", "element": "O", "hcount": 0, "weight": 1}),
+                                 (2, {"atomname": "C2", "fragname": "PEO", "bonding": ["$1", "$11"], "element": "C", "hcount": 2, "weight": 1}),
                                  ),
-                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "w": 1}),
+                         "OHter": ((0, {"atomname": "O0", "fragname": "OHter", "bonding": ["$1"], "element": "O", "hcount": 1, "weight": 1}),
                                    )},
                         {"PEO": [(0, 1), (1, 2),],
                          "OHter": []}),
diff --git a/cgsmiles/tests/test_molecule_resolve.py b/cgsmiles/tests/test_molecule_resolve.py
index db7b1cc..ebc8299 100644
--- a/cgsmiles/tests/test_molecule_resolve.py
+++ b/cgsmiles/tests/test_molecule_resolve.py
@@ -319,7 +319,7 @@ def _ele_match(n1, n2):
     if weights:
         mol_weights = {node: 1 for node in ref_graph}
         mol_weights.update(weights)
-        weights_assigned = nx.get_node_attributes(molecule, 'w')
+        weights_assigned = nx.get_node_attributes(molecule, 'weight')
         assert mol_weights == weights_assigned
 
 @pytest.mark.parametrize('case, cgsmiles_str, ref_string',(

From 64027bcbdda4dd32bced4ef3a0b07550550beaa7 Mon Sep 17 00:00:00 2001
From: "Dr. Fabian Grunewald" <32294573+fgrunewald@users.noreply.github.com>
Date: Fri, 22 Nov 2024 14:13:32 +0100
Subject: [PATCH 16/16] Update cgsmiles/dialects.py

Co-authored-by: Peter C Kroon <pckroon@users.noreply.github.com>
---
 cgsmiles/dialects.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py
index 81b3687..5404c3b 100644
--- a/cgsmiles/dialects.py
+++ b/cgsmiles/dialects.py
@@ -92,14 +92,9 @@ def _parse_dialect_string(string_iterable,
     # convert keys to more verbose names
     # this should only apply to args know to
     # the signature
-    remove_keys = []
     for old_key, new_key in arg_to_fullname.items():
         if old_key in applied_labels.arguments:
-            applied_labels.arguments[new_key] = applied_labels.arguments[old_key]
-            remove_keys.append(old_key)
-
-    for key in remove_keys:
-        del applied_labels.arguments[key]
+            applied_labels.arguments[new_key] = applied_labels.arguments.pop(old_key)
 
     # if there are kwargs we need to put them into
     # output dict