Skip to content

Commit

Permalink
Merge pull request #24 from gruenewald-lab/CG_bond_orders
Browse files Browse the repository at this point in the history
implement bond order syntax for CG
  • Loading branch information
fgrunewald authored Oct 16, 2024
2 parents e398f3d + 8af9a51 commit ad9844c
Show file tree
Hide file tree
Showing 8 changed files with 205 additions and 63 deletions.
7 changes: 5 additions & 2 deletions cgsmiles/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,11 @@ def set_atom_names_atomistic(molecule, meta_graph=None):
fraglist = defaultdict(list)
if meta_graph:
for meta_node in meta_graph.nodes:
fraggraph = meta_graph.nodes[meta_node]['graph']
fraglist[meta_node] += list(fraggraph.nodes)
# to catch virtual side nodes that do not have a representation
# in the atomsitic structure
fraggraph = meta_graph.nodes[meta_node].get('graph', None)
if fraggraph:
fraglist[meta_node] += list(fraggraph.nodes)
else:
node_to_fragid = nx.get_node_attributes(molecule, 'fragid')
for node, fragids in node_to_fragid.items():
Expand Down
115 changes: 81 additions & 34 deletions cgsmiles/read_cgsmiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,32 +31,29 @@ def _expand_branch(mol_graph, current, anchor, recipe):
anchor: abc.hashable
anchor to which to connect current node
recipie: list[(str, int)]
recipe: list[(str, int, int)]
list storing tuples of node names and
the number of times the node has to be added
and their bond order
Returns
-------
nx.Graph
"""
prev_node = anchor
for bdx, (fragname, n_mon) in enumerate(recipe):
for bdx, (fragname, n_mon, order) in enumerate(recipe):
if bdx == 0:
anchor = current
for _ in range(0, n_mon):
mol_graph.add_node(current, fragname=fragname)
mol_graph.add_edge(prev_node, current, order=1)
mol_graph.add_edge(prev_node, current, order=order)

prev_node = current
current += 1

prev_node = anchor
return mol_graph, current, prev_node

def _get_percent(pattern, stop):
end_num = _find_next_character(pattern, ['[', ')', '(', '}'], stop)
return pattern[stop+1:end_num]

def read_cgsmiles(pattern):
"""
Generate a :class:`nx.Graph` from a pattern string according to the
Expand Down Expand Up @@ -133,6 +130,10 @@ def read_cgsmiles(pattern):
cycle_edges = []
# each element in the for loop matches a pattern
# '[' + '#' + some alphanumeric name + ']'
symbol_to_order = {".": 0, "=": 2, "-": 1, "#": 3, "$": 4}
default_bond_order = 1
bond_order = None
prev_bond_order = None
for match in re.finditer(PATTERNS['place_holder'], pattern):
start, stop = match.span()
# we start a new branch when the residue is preceded by '('
Expand All @@ -142,32 +143,61 @@ def read_cgsmiles(pattern):
branch_anchor.append(prev_node)
# the recipe for making the branch includes the anchor;
# which is hence the first residue in the list
recipes[branch_anchor[-1]] = [(mol_graph.nodes[prev_node]['fragname'], 1)]
# at this point the bond order is still 1 unless we have an expansion
recipes[branch_anchor[-1]] = [(mol_graph.nodes[prev_node]['fragname'], 1, 1)]

# here we check if the atom is followed by a cycle marker
# in this case we have an open cycle and close it
for token in pattern[stop:]:
# we close a cycle
if token.isdigit() and token in cycle:
cycle_edges.append((current, cycle[token]))
del cycle[token]
# we open a cycle
elif token.isdigit():
cycle[token] = current
# we found a ring indicator
elif token == "%":
ring_marker = _get_percent(pattern, stop)
# we close the ring
ring_marker = ""
multi_ring = False
ring_bond_order = default_bond_order
for rdx, token in enumerate(pattern[stop:]):
if multi_ring and not token.isdigit():
ring_marker = int(ring_marker[1:])
if ring_marker in cycle:
cycle_edges.append((current, cycle[ring_marker]))
cycle_edges.append((current,
cycle[ring_marker][0],
cycle[ring_marker][1]))
del cycle[ring_marker]
break
# we open a new ring
cycle[_get_percent(pattern, stop)] = current
break
else:
cycle[ring_marker] = [current, ring_bond_order]
multi_ring = False
ring_marker = ""
ring_bond_order = default_bond_order

# we open a new multi ring
if token == "%":
multi_ring = True
ring_marker = '%'
# we open a ring or close
elif token.isdigit():
ring_marker += token
if not multi_ring:
ring_marker = int(ring_marker)
# we have a single digit marker and it is in
# cycle so we close it
if ring_marker in cycle:
cycle_edges.append((current,
cycle[ring_marker][0],
cycle[ring_marker][1]))
del cycle[ring_marker]
# the marker is not in cycle so we update cycles
else:
cycle[ring_marker] = [current, ring_bond_order]
ring_marker = ""
ring_bond_order = default_bond_order
# we found bond_order
elif token in symbol_to_order:
ring_bond_order = symbol_to_order[token]
else:
break

# check if there is a bond-order following the node
if stop < len(pattern) and pattern[stop+rdx-1] in '- + . = # $':
bond_order = symbol_to_order[pattern[stop+rdx-1]]
else:
bond_order = default_bond_order

# here we check if the atom is followed by a expansion character '|'
# as in ... [#PEO]|
if stop < len(pattern) and pattern[stop] == '|':
Expand Down Expand Up @@ -199,24 +229,27 @@ def read_cgsmiles(pattern):
# the recipe dict together with the anchor residue
# and expansion number
if branching:
recipes[branch_anchor[-1]].append((fragname, n_mon))
recipes[branch_anchor[-1]].append((fragname, n_mon, prev_bond_order))

# new we add new residue as often as required
connection = []
for _ in range(0, n_mon):
mol_graph.add_node(current, fragname=fragname, charge=charge)

if prev_node is not None:
mol_graph.add_edge(prev_node, current, order=1)
mol_graph.add_edge(prev_node, current, order=prev_bond_order)

prev_bond_order = bond_order

# here we have a double edge
for cycle_edge in cycle_edges:
if cycle_edge in mol_graph.edges:
mol_graph.edges[cycle_edge]["order"] += 1
else:
mol_graph.add_edge(cycle_edge[0],
cycle_edge[1],
order=1)
msg=("You define two edges between the same node."
"Use bond order symbols instead.")
raise SyntaxError(msg)
mol_graph.add_edge(cycle_edge[0],
cycle_edge[1],
order=cycle_edge[2])

prev_node = current
current += 1
Expand Down Expand Up @@ -246,12 +279,19 @@ def read_cgsmiles(pattern):
eon_a = _find_next_character(pattern, [')'], stop)
# Then we check if the expansion character
# is next.
if eon_a+1 < len(pattern) and pattern[eon_a+1] == "|":
if (eon_a+1 < len(pattern) and pattern[eon_a+1] == "|") or\
(eon_a+2 < len(pattern) and pattern[eon_a+2] == "|"):
if pattern[eon_a+2] == "|":
anchor_order = symbol_to_order[pattern[eon_a+1]]
recipe = recipes[prev_node][0]
recipes[prev_node][0] = (recipe[0], recipe[1], anchor_order)
eon_a += 1
# If there is one we find the beginning
# of the next branch, residue or end of the string
# As before all characters inbetween are a number that
# is how often the branch is expanded.
eon_b = _find_next_character(pattern, ['[', ')', '(', '}'], eon_a+1)
next_characters = ['[', ')', '(', '}'] + list(symbol_to_order.keys())
eon_b = _find_next_character(pattern, next_characters, eon_a+1)
# the outermost loop goes over how often a the branch has to be
# added to the existing sequence
for idx in range(0,int(pattern[eon_a+2:eon_b])-1):
Expand Down Expand Up @@ -286,6 +326,13 @@ def read_cgsmiles(pattern):
prev_anchor = ref_anchor
# all branches added; then go back to the base anchor
prev_node = base_anchor
#================================================
# bond orders for after branches #
#================================================
if pattern[eon_b] in symbol_to_order:
prev_bond_order = symbol_to_order[pattern[eon_b]]
elif eon_a+1 < len(pattern) and pattern[eon_a+1] in symbol_to_order:
prev_bond_order = symbol_to_order[pattern[eon_a+1]]
# if all branches are done we need to reset the lists
# when all nested branches are completed
if len(branch_anchor) == 0:
Expand Down
10 changes: 10 additions & 0 deletions cgsmiles/resolve.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import copy
import numpy as np
import networkx as nx
from .read_cgsmiles import read_cgsmiles
from .read_fragments import read_fragments
Expand Down Expand Up @@ -239,6 +240,15 @@ def resolve_disconnected_molecule(self, fragment_dict):
"""
for meta_node in self.meta_graph.nodes:
fragname = self.meta_graph.nodes[meta_node]['fragname']

# we have a virtual node and skip it
if fragname not in fragment_dict:
neighbors = self.meta_graph.neighbors(meta_node)
orders = [self.meta_graph.edges[(meta_node, neigh)]["order"] for neigh in neighbors]
if not all(np.array(orders) == 0):
raise SyntaxError(f"Found node #{fragname} but no corresponding fragment.")
continue

fragment = fragment_dict[fragname]
correspondence = merge_graphs(self.molecule, fragment)

Expand Down
2 changes: 1 addition & 1 deletion cgsmiles/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class MoleculeSampler:
descriptor is used to highlight the fact that the probabilistic behaviour is driven
by the bonding descriptors.
The sampler features a two level connectivity determination. First using the
The sampler features a two level connectivity determination. First using the
`polymer_reactivities` an open bonding descriptor from the growing polymer
molecule is selected according to the probabilities provided. Subsequently,
a fragment is randomly chosen that has a matching complementary bonding
Expand Down
75 changes: 73 additions & 2 deletions cgsmiles/tests/test_cgsmile_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
[(0, 1), (1, 2)],
[1, 1]),
# smiple linear sequenece with multi-edge
("{[#PMA]1[#PEO]1}",
("{[#PMA]=[#PEO]}",
["PMA", "PEO"],
None,
[(0, 1)],
Expand Down Expand Up @@ -46,6 +46,41 @@
None,
[(0, 1), (1, 2), (0, 2)],
[1, 1, 1]),
# smiple cycle sequence bond order to next
("{[#PMA]1=[#PEO][#PMA]1}",
["PMA", "PEO", "PMA"],
[(0, 1), (1, 2), (0, 2)],
[2, 1, 1]),
# smiple cycle sequence bond order in cycle
("{[#PMA]=1[#PEO][#PMA]1}",
["PMA", "PEO", "PMA"],
[(0, 1), (1, 2), (0, 2)],
[1, 1, 2]),
# smiple cycle sequence two bond orders
("{[#PMA].1=[#PEO][#PMA]1}",
["PMA", "PEO", "PMA"],
[(0, 1), (1, 2), (0, 2)],
[2, 1, 0]),
# smiple cycle sequence with % bond order
("{[#PMA]=%123[#PEO][#PMA]%123}",
["PMA", "PEO", "PMA"],
[(0, 1), (1, 2), (0, 2)],
[1, 1, 2]),
# smiple cycle mixed % and digit marker
("{[#PMA]=1[#PEO][#PMA]%01}",
["PMA", "PEO", "PMA"],
[(0, 1), (1, 2), (0, 2)],
[1, 1, 2]),
# smiple cycle sequence with % bond order next
("{[#PMA]%123=[#PEO][#PMA]%123}",
["PMA", "PEO", "PMA"],
[(0, 1), (1, 2), (0, 2)],
[2, 1, 1]),
# smiple cycle sequence with % two bond orders
("{[#PMA]=%123.[#PEO][#PMA]%123}",
["PMA", "PEO", "PMA"],
[(0, 1), (1, 2), (0, 2)],
[0, 1, 2]),
# smiple cycle sequence with %
("{[#PMA]%123[#PEO][#PMA]%123}",
["PMA", "PEO", "PMA"],
Expand All @@ -72,7 +107,7 @@
# [1, 1, 1, 1, 1, 1, 1, 1]),
# smiple linear sequenece with multi-edge
# in cycle
("{[#PMA]12[#PMA][#PMA][#PEO]12}",
("{[#PMA]=1[#PMA][#PMA][#PEO]1}",
["PMA", "PMA", "PMA", "PEO"],
None,
[(0, 1), (1, 2), (2, 3), (0, 3)],
Expand All @@ -87,6 +122,42 @@
(0, 4), (4, 5), (5, 6), (6, 7),
(4, 8), (8, 9), (9, 10), (10, 11)],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
# simple branch expension with bond orders
("{[#PMA]([#PEO][#PEO]=[#OHter])|3}",
["PMA", "PEO", "PEO", "OHter",
"PMA", "PEO", "PEO", "OHter",
"PMA", "PEO", "PEO", "OHter"],
[(0, 1), (1, 2), (2, 3),
(0, 4), (4, 5), (5, 6), (6, 7),
(4, 8), (8, 9), (9, 10), (10, 11)],
[1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2]),
# simple branch expension with bond orders
("{[#PMA].([#PEO][#PEO]=[#OHter])|3}",
["PMA", "PEO", "PEO", "OHter",
"PMA", "PEO", "PEO", "OHter",
"PMA", "PEO", "PEO", "OHter"],
[(0, 1), (1, 2), (2, 3),
(0, 4), (4, 5), (5, 6), (6, 7),
(4, 8), (8, 9), (9, 10), (10, 11)],
[0, 1, 2, 1, 0, 1, 2, 1, 0, 1, 2]),
# simple branch expension with bond orders
("{[#PMA]([#PEO][#PEO]=[#OHter])|3.[#E]}",
["PMA", "PEO", "PEO", "OHter",
"PMA", "PEO", "PEO", "OHter",
"PMA", "PEO", "PEO", "OHter", "E"],
[(0, 1), (1, 2), (2, 3),
(0, 4), (4, 5), (5, 6), (6, 7),
(4, 8), (8, 9), (9, 10), (10, 11), (8, 12)],
[1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0]),
# not so simple branch expension with bond orders
("{[#PMA]([#PEO][#PEO]=[#OHter])$|3.[#E]}",
["PMA", "PEO", "PEO", "OHter",
"PMA", "PEO", "PEO", "OHter",
"PMA", "PEO", "PEO", "OHter", "E"],
[(0, 1), (1, 2), (2, 3),
(0, 4), (4, 5), (5, 6), (6, 7),
(4, 8), (8, 9), (9, 10), (10, 11), (8, 12)],
[1, 1, 2, 4, 1, 1, 2, 4, 1, 1, 2, 0]),
# nested branched with expansion
("{[#PMA]([#PEO]|3)|2}",
["PMA", "PEO", "PEO", "PEO",
Expand Down
Loading

0 comments on commit ad9844c

Please sign in to comment.