diff --git a/setup.cfg b/setup.cfg index 450fb455c..6581337b8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,6 +34,7 @@ install-requires = # ?? requires-dist? pbr numpy networkx ~= 2.0 + lark zip-safe = False [options.extras_require] diff --git a/vermouth/__init__.py b/vermouth/__init__.py index 1d57c03d3..694b89f09 100644 --- a/vermouth/__init__.py +++ b/vermouth/__init__.py @@ -32,16 +32,17 @@ # Find the data directory once. +from pathlib import Path try: import pkg_resources except ImportError: import os - DATA_PATH = os.path.join(os.path.dirname(__file__), 'data') + DATA_PATH = Path(os.path.dirname(__file__)) / 'data' del os else: - DATA_PATH = pkg_resources.resource_filename('vermouth', 'data') + DATA_PATH = Path(pkg_resources.resource_filename('vermouth', 'data')) del pkg_resources - +del Path del pbr try: diff --git a/vermouth/data/grammars/gmx_rtp.lark b/vermouth/data/grammars/gmx_rtp.lark new file mode 100644 index 000000000..1de98b49a --- /dev/null +++ b/vermouth/data/grammars/gmx_rtp.lark @@ -0,0 +1,37 @@ +_header{name}: "[" name "]" _NL+ + +start: _NL* bondedtypes? residue+ +bondedtypes: _header{"bondedtypes"} INT~8 _NL* + +residue: _header{NAME} mol_section* + +?mol_section: atomsection + | _header{"bonds"} bond+ -> bonds + | _header{"angles"} angle+ -> angles + | _header{"dihedrals"} dihedral+ -> dihedrals + | _header{"impropers"} dihedral+ -> impropers + | _header{"cmap"} cmap+ -> cmaps + | _header{"exclusions"} bond+ -> exclusions + +name_line: NAME INT _NL* + +atomsection: _header{"atoms"} atom+ +atom: NAME NAME SIGNED_NUMBER INT _NL* + +INTERACTION_ATOM: ("+"+|"-"+)? NAME +bond: INTERACTION_ATOM~2 [parameters] _NL+ +angle: INTERACTION_ATOM~3 [parameters] _NL+ +dihedral: INTERACTION_ATOM~4 [parameters] _NL+ +cmap: INTERACTION_ATOM~5 [parameters] _NL+ + +parameters: (NAME|SIGNED_NUMBER+) + +NAME: /\w/+ +VALUE: /./+ +COMMENT: /;[^\n]*/ +%ignore COMMENT + +%import common (INT, SIGNED_NUMBER) +%import common.NEWLINE -> _NL +%import common.WS_INLINE +%ignore WS_INLINE diff --git a/vermouth/gmx/rtp.py b/vermouth/gmx/rtp.py index b4d4e92ed..1b8c1bea5 100644 --- a/vermouth/gmx/rtp.py +++ b/vermouth/gmx/rtp.py @@ -1,4 +1,4 @@ -# Copyright 2018 University of Groningen +# Copyright 2020 University of Groningen # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,225 +16,188 @@ Handle the RTP format from Gromacs. """ -import collections -import itertools -import networkx as nx +from collections import defaultdict, namedtuple +from itertools import groupby +import lark +from .. import DATA_PATH from ..molecule import Block, Link, Interaction -from .. import utils +from ..utils import first_alpha + __all__ = ['read_rtp'] -# Name of the subsections in RTP files. -# Names starting with a '_' are for internal use. -RTP_SUBSECTIONS = ('atoms', 'bonds', 'angles', 'dihedrals', - 'impropers', 'cmap', 'exclusions', - '_bondedtypes') + +GRAMMAR_FILE = DATA_PATH / 'grammars' / 'gmx_rtp.lark' +with open(GRAMMAR_FILE) as file: + GRAMMAR = file.read() -_BondedTypes = collections.namedtuple( +_BondedTypes = namedtuple( '_BondedTypes', - 'bonds angles dihedrals impropers all_dihedrals nrexcl HH14 remove_dih' + 'bonds angles dihedrals impropers all_dihedrals nrexcl HH14 remove_dih cmap exclusions' ) -class _IterRTPSubsectionLines: - """ - Iterate over the lines of an RTP file within a subsection. - """ +class Transformer(lark.visitors.Transformer_InPlace): + def INT(self, tok): + return tok.update(value=int(tok)) + + def SIGNED_NUMBER(self, tok): + return tok.update(value=float(tok)) + + def PARAMETER(self, tok): + return tok.value + + def atom(self, tokens): + tokens = (tok.value for tok in tokens) + return dict(zip(['atomname', 'atype', 'charge', 'charge_group'], tokens)) + + +class RTPInterpreter(lark.visitors.Interpreter): + ATOMS_PER_INTERACTION = {'bonds': 2, 'angles': 3, 'dihedrals': 4, 'impropers': 4, 'cmap': 5, 'exclusions': 2} + def __init__(self, *args, force_field=None, **kwargs): + super().__init__(*args, **kwargs) + self.bonded_types = [] + self.block = None + self.force_field = force_field + self.links = [] + + def bondedtypes(self, tree): + # Default taken from + # 'src/gromacs/gmxpreprocess/resall.cpp::read_resall' in the Gromacs + # source code. + defaults = 1, 1, 1, 1, 0, 3, 1, 1 + read = [tok.value for tok in tree.children] + self.bonded_types = _BondedTypes(*(read + list(defaults[len(read):])), cmap=1, exclusions=1) + + def _count_hydrogens(self, idxs): + names = [self.block.nodes[idx].get('atomname') for idx in idxs] + return len([name for name in names if first_alpha(name) == 'H']) + + def _finish_block(self): + self.block.make_edges_from_interactions() + + # Add all possible angles + for atoms in self.block.guess_angles(): + self.block.add_interaction('angles', atoms=atoms, parameters=[self.bonded_types.angles]) + + # Generate missing dihedrals + # As pdb2gmx generates all the possible dihedral angles by default, + # RTP files are written assuming they will be generated. A RTP file + # have some control over these dihedral angles through the bondedtypes + # section. + all_dihedrals = [] + for center, dihedrals in groupby( + sorted(self.block.guess_dihedrals(), key=_dihedral_sorted_center), + _dihedral_sorted_center): + if _keep_dihedral(center, self.block, self.bonded_types): + # See src/gromacs/gmxpreprocess/gen_add.cpp::dcomp in the + # Gromacs source code (see version 2016.3 for instance). + # We only keep the one with 1) fewest hydrogens, and 2) lowest + # indices + atoms = min(dihedrals, key=lambda idxs: (self._count_hydrogens(idxs), idxs)) + all_dihedrals.append(Interaction(atoms=atoms, parameters=[self.bonded_types.dihedrals], meta={})) + self.block.interactions['dihedrals'] = sorted( + self.block.interactions.get('dihedrals', []) + all_dihedrals, + key=lambda inter: inter.atoms + ) + + # TODO: generate 14 pairs, exclusions, ... + + # Split off the links. We usually generate 3 links, one -, one +, and + # possibly one +-. This is because of terminal residues. + link_groups = defaultdict(set) + for interactions in self.block.interactions.values(): + for interaction in interactions: + orders = frozenset(self.block.nodes[idx]['atomname'][0] + for idx in interaction.atoms if + self.block.nodes[idx]['atomname'][0] in '+-') + if orders: + link_groups[orders].update(interaction.atoms) + + for link_nodes in link_groups.values(): + tmp_link = self.block.subgraph(link_nodes) + link = Link(tmp_link) + link.interactions = tmp_link.interactions + for n_idx in link.nodes: + # Keep only desired node attributes + node = link.nodes[n_idx] + for attr in list(node.keys()): + if attr not in ('atomname', 'resname'): + del node[attr] + if node['atomname'][0] in '+-': + order, node['atomname'] = node['atomname'][0], node['atomname'][1:] + node['order'] = 1 if order == '+' else -1 + self.links.append(link) + + # Remove the foreign nodes from the block + for n_idx in list(self.block.nodes): + if self.block.nodes[n_idx]['atomname'][0] in '+-': + self.block.remove_node(n_idx) + + def residue(self, tree): + name = tree.children[0] + self.block = Block(name=name.value, nrexcl=self.bonded_types.nrexcl, force_field=self.force_field) + self.visit_children(tree) # Interpret the children, i.e. atoms and interactions + self._finish_block() + + self.force_field.blocks[name] = self.block + self.force_field.links.extend(self.links) + self.links = [] + + def atomsection(self, tree): + self.block.add_nodes_from(enumerate(tree.children)) + for node_idx in self.block: + self.block.nodes[node_idx]['resname'] = self.block.name + + def _interaction(self, tree, type_): + num_atoms = self.ATOMS_PER_INTERACTION[type_] + atoms, parameters = tree.children[:num_atoms], tree.children[num_atoms] + params = parameters.children if parameters else [getattr(self.bonded_types, type_)] + atids = [] + for atom in atoms: + if atom.value[0] in ('+-'): + # We need to add the "link" atom/node so we can find edges based on + # the link interactions. We separate these atoms and interactions + # into Links later. + try: + atid = next(self.block.find_atoms(atomname=atom.value)) + except StopIteration: + self.block.add_node(len(self.block), atomname=atom.value) + atid = len(self.block) - 1 + else: + try: + atid = next(self.block.find_atoms(atomname=atom.value)) + except StopIteration as err: + raise KeyError('No atom with name {} is found in residue {}' + ''.format(atom.value, self.block.name)) from err + atids.append(atid) + self.block.add_interaction(type_, atoms=atids, parameters=params) - def __init__(self, parent): - self.parent = parent - self.lines = parent.lines - self.running = True - - def __next__(self): - if not self.running: - raise StopIteration - line = next(self.lines) - if line.strip().startswith('['): - self.parent.buffer.append(line) - self.running = False - raise StopIteration - return line - - def __iter__(self): - return self - - def flush(self): - """ - Move the iterator after the last line of the subsection. - """ - for _ in self: - pass - - -class _IterRTPSubsections: - """ - Iterate over the subsection of a RTP file within a section. + def bonds(self, tree): + for child in tree.children: + self._interaction(child, "bonds") - For each subsections, yields its name and an iterator over its lines. - """ + def angles(self, tree): + for child in tree.children: + self._interaction(child, "angles") - def __init__(self, parent): - self.parent = parent - self.lines = parent.lines - self.buffer = collections.deque() - self.current_subsection = None - self.running = True - - def __next__(self): - if not self.running: - raise StopIteration - if self.current_subsection is not None: - self.current_subsection.flush() - if self.buffer: - line = self.buffer.popleft() - else: - line = next(self.lines) - stripped = line.strip() - if stripped.startswith('['): - # A section header looks like "[ name ]". It matches the following - # regexp: r"^\s*\[\s*(?P)\s*\]\s*$". Once stripped, the - # trailing spaces at the beginning and at the end are removed so - # the string starts with '[' and ends with ']'. The slicing remove - # these brackets. The final call to strip remove the potential - # white characters between the brackets and the section name. - name = stripped[1:-1].strip() - if name in RTP_SUBSECTIONS: - subsection = _IterRTPSubsectionLines(self) - self.current_subsection = subsection - return name, subsection - self.parent.buffer.append(line) - self.running = False - raise StopIteration - raise IOError('I am almost sure I should not be here...') - - def __iter__(self): - return self - - def flush(self): - """ - Move the iterator after the last subsection of the section. - """ - for _ in self: - pass - - -class _IterRTPSections: - """ - Iterate over the sections of a RTP file. + def dihedrals(self, tree): + for child in tree.children: + self._interaction(child, "dihedrals") - For each section, yields the name of the sections and an iterator over its - subsections. - """ + def impropers(self, tree): + for child in tree.children: + self._interaction(child, "impropers") + + def cmaps(self, tree): + for child in tree.children: + self._interaction(child, "cmap") - def __init__(self, lines): - self.lines = lines - self.buffer = collections.deque() - self.current_section = None - - def __next__(self): - if self.current_section is not None: - self.current_section.flush() - if self.buffer: - line = self.buffer.popleft() - else: - line = next(self.lines) - stripped = line.strip() - if stripped.startswith('['): - name = stripped[1:-1].strip() - section = _IterRTPSubsections(self) - self.current_section = section - # The "[ bondedtypes ]" is special in the sense that it does - # not have subsection, but instead have its content directly - # under it. This breaks the neat hierarchy the rest of the file - # has. Here we restore the hierarchy by faking that the file - # contains: - # - # [ bondedtypes ] - # [ _bondedtypes ] - # - # For now, I do that on the basis of the section name. If other - # sections are special that way, I'll detect them with a look - # ahead. - if name == 'bondedtypes': - section.buffer.append(' [ _bondedtypes ]') - return name, section - raise IOError('Hum... There is a bug in the RTP reader.') - - def __iter__(self): - return self - - -def _atoms(subsection, block): - for line in subsection: - name, atype, charge, charge_group = line.split() - atom = { - 'atomname': name, - 'atype': atype, - 'charge': float(charge), - 'charge_group': int(charge_group), - } - block.add_atom(atom) - - -def _base_rtp_parser(interaction_name, natoms): - def wrapped(subsection, block): - """ - Parse the lines from a RTP subsection and populate the block. - """ - interactions = [] - for line in subsection: - splitted = line.strip().split() - atoms = splitted[:natoms] - parameters = splitted[natoms:] - interactions.append(Interaction(atoms=atoms, - parameters=parameters, - meta={})) - block.interactions[interaction_name] = interactions - return wrapped - - -def _parse_bondedtypes(section): - # Default taken from - # 'src/gromacs/gmxpreprocess/resall.cpp::read_resall' in the Gromacs - # source code. - defaults = _BondedTypes(bonds=1, angles=1, dihedrals=1, - impropers=1, all_dihedrals=0, - nrexcl=3, HH14=1, remove_dih=1) - - # The 'bondedtypes' section contains its line directly under it. In - # order to match the hierarchy model of the rest of the file, the - # iterator actually yields a subsection named '_bondedtypes'. We need - # to read the fist line of that first virtual subsection. - _, lines = next(section) - line = next(lines) - read = [int(x) for x in line.split()] - - # Fill with the defaults. The file gives the values in order so we - # need to append the missing values from the default at the end. - bondedtypes = _BondedTypes(*(read + list(defaults[len(read):]))) - - # Make sure there is no unexpected lines in the section. - # Come on Jonathan! There must be a more compact way of doing it. - try: - next(lines) - except StopIteration: - pass - else: - raise IOError('"[ bondedtypes ]" section is missformated.') - try: - next(section) - except StopIteration: - pass - else: - raise IOError('"[ bondedtypes ]" section is missformated.') - return bondedtypes - - -def _count_hydrogens(names): - return len([name for name in names if utils.first_alpha(name) == 'H']) + def exclusions(self, tree): + for child in tree.children: + self._interaction(child, "exclusions") def _keep_dihedral(center, block, bondedtypes): @@ -245,223 +208,6 @@ def _keep_dihedral(center, block, bondedtypes): return True -def _complete_block(block, bondedtypes): - """ - Add information from the bondedtypes section to a block. - - Generate implicit dihedral angles, and add function types to the - interactions. - """ - block.make_edges_from_interactions() - - # Generate missing dihedrals - # As pdb2gmx generates all the possible dihedral angles by default, - # RTP files are written assuming they will be generated. A RTP file - # have some control over these dihedral angles through the bondedtypes - # section. - all_dihedrals = [] - for center, dihedrals in itertools.groupby( - sorted(block.guess_dihedrals(), key=_dihedral_sorted_center), - _dihedral_sorted_center): - if _keep_dihedral(center, block, bondedtypes): - # TODO: Also sort the dihedrals by index. - # See src/gromacs/gmxpreprocess/gen_add.cpp::dcomp in the - # Gromacs source code (see version 2016.3 for instance). - atoms = sorted(dihedrals, key=_count_hydrogens)[0] - all_dihedrals.append(Interaction(atoms=atoms, parameters=[], meta={})) - # TODO: Sort the dihedrals by index - block.interactions['dihedrals'] = ( - block.interactions.get('dihedrals', []) + all_dihedrals - ) - - # TODO: generate 1-4 interactions between pairs of hydrogen atoms - - # Add function types to the interaction parameters. This is done as a - # post processing step to cluster as much interaction specific code - # into this method. - # I am not sure the function type can be set explicitly in the RTP - # file except through the bondedtypes section. If it is possible, then the - # following code can break and atoms can have the function type written - # twice. Yet, none of the RTP files distributed with Gromacs 2016.3 causes - # issue. - functypes = { - 'bonds': bondedtypes.bonds, - 'angles': bondedtypes.angles, - 'dihedrals': bondedtypes.dihedrals, - 'impropers': bondedtypes.impropers, - 'exclusions': 1, - 'cmap': 1, - } - for name, interactions in block.interactions.items(): - for interaction in interactions: - interaction.parameters.insert(0, functypes[name]) - - # Set the nrexcl to the block. - block.nrexcl = bondedtypes.nrexcl - - -def _split_blocks_and_links(pre_blocks): - """ - Split all the pre-blocks from `pre_block` into blocks and links. - - Parameters - ---------- - pre_blocks: dict - A dict with residue names as keys and instances of :class:`Block` - as values. - - Returns - ------- - blocks: dict - A dict like `pre_block` with all inter-residues information - stripped out. - links: list - A list of instances of :class:`Link` containing the inter-residues - information from `pre_blocks`. - - See Also - -------- - _split_block_and_link - Split an individual pre-block into a block and a link. - """ - blocks = {} - links = [] - for name, pre_block in pre_blocks.items(): - block, link = _split_block_and_link(pre_block) - blocks[name] = block - links.append(link) - return blocks, links - - -def _split_block_and_link(pre_block): - """ - Split `pre_block` into a block and a link. - - A pre-block is a block as read from an RTP file. It contains both - intra-residue and inter-residues information. This method split this - information so that the intra-residue interactions are put in a block - and the inter-residues ones are put in a link. - - Parameters - ---------- - pre_block: Block - The block to split. - - Returns - ------- - block: Block - All the intra-residue information. - link: Link - All the inter-residues information. - """ - block = Block(force_field=pre_block.force_field) - link = Link() - - # It is easier to fill the interactions using a defaultdict, - # yet defaultdicts are more annoying when reading as querying them - # creates the keys. So the interactions are revert back to regular - # dict at the end. - block.interactions = collections.defaultdict(list) - link.interactions = collections.defaultdict(list) - - block.name = pre_block.name - try: - block.nrexcl = pre_block.nrexcl - except AttributeError: - pass - - # Filter the particles from neighboring residues out of the block. - for atom in pre_block.atoms: - if not atom['atomname'].startswith('+-'): - atom['resname'] = pre_block.name - block.add_atom(atom) - link.add_node(atom['atomname']) - - # Create the edges of the link and block based on the edges in the pre-block. - # This will create too many edges in the link, but the useless ones will be - # pruned latter. - link.add_edges_from(pre_block.edges) - block.add_edges_from(edge for edge in pre_block.edges - if not any(node[0] in '+-' for node in edge)) - - # Split the interactions from the pre-block between the block (for - # intra-residue interactions) and the link (for inter-residues ones). - # The "relevant_atoms" set keeps track of what particles are - # involved in the link. This will allow to prune the link without - # iterating again through its interactions. - relevant_atoms = set() - for name, interactions in pre_block.interactions.items(): - for interaction in interactions: - for_link = any(atom[0] in '+-' for atom in interaction.atoms) - if for_link: - link.interactions[name].append(interaction) - relevant_atoms.update(interaction.atoms) - else: - block.interactions[name].append(interaction) - - # Prune the link to keep only the edges and particles that are - # relevant. - nodes = set(link.nodes()) - link.remove_nodes_from(nodes - relevant_atoms) - # Some interactions do not generate nodes (impropers for instance). If a - # node is only described in one of such interactions, then the node never - # appears in the link. Here we make sure these nodes exists even if they - # are note connected. - link.add_nodes_from(relevant_atoms) - - # Atoms from a links are matched against a molecule based on its node - # attributes. The name is a primary criterion, but other criteria can be - # the residue name (`resname` key) or the residue order in the chain - # (`order` key). The residue order is 0 when refering to the current - # residue, +int to refer to residues after the current one in the sequence, - # -int to refer to a previous residue in the sequence, and '*' for any - # residue but the current one. - # RTP files convey the order by prefixing the names with + or -. We need to - # get rid of these prefixes. - order = {'+': +1, '-': -1} - relabel_mapping = {} - for idx, node in enumerate(link.nodes()): - atomname = node - if node[0] in '+-': - link.nodes[node]['order'] = order[node[0]] - atomname = atomname[1:] - else: - link.nodes[node]['order'] = 0 - link.nodes[node]['resname'] = block.name - link.nodes[node]['atomname'] = atomname - relabel_mapping[node] = idx - nx.relabel_nodes(link, relabel_mapping, copy=False) - - # By relabelling the nodes, we lost the relations between interactions and - # nodes, so we need to relabel the atoms in the interactions - new_interactions = collections.defaultdict(list) - for name, interactions in link.interactions.items(): - for interaction in interactions: - atoms = tuple(relabel_mapping[atom] for atom in interaction.atoms) - new_interactions[name].append(Interaction( - atoms=atoms, - parameters=interaction.parameters, - meta=interaction.meta - )) - link.interactions = new_interactions - - - # Revert the interactions back to regular dicts to avoid creating - # keys when querying them. - block.interactions = dict(block.interactions) - link.interactions = dict(link.interactions) - - return block, link - - -def _clean_lines(lines): - # TODO: merge continuation lines - for line in lines: - splitted = line.split(';', 1) - if splitted[0].strip(): - yield splitted[0] - - def _dihedral_sorted_center(atoms): #return sorted(atoms[1:-1]) return atoms[1:-1] @@ -484,41 +230,8 @@ def read_rtp(lines, force_field): IOError Something in the file could not be parsed. """ - _subsection_parsers = { - 'atoms': _atoms, - 'bonds': _base_rtp_parser('bonds', natoms=2), - 'angles': _base_rtp_parser('angles', natoms=3), - 'impropers': _base_rtp_parser('impropers', natoms=4), - 'cmap': _base_rtp_parser('cmap', natoms=5), - } - # An RTP file contains both the blocks and the links. - # We first read everything in "pre-blocks"; then we separate the - # blocks from the links. - pre_blocks = {} - bondedtypes = None - cleaned = _clean_lines(lines) - for section_name, section in _IterRTPSections(cleaned): - if section_name == 'bondedtypes': - bondedtypes = _parse_bondedtypes(section) - continue - block = Block(force_field=force_field) - pre_blocks[section_name] = block - block.name = section_name - - for subsection_name, subsection in section: - if subsection_name in _subsection_parsers: - _subsection_parsers[subsection_name](subsection, block) - - # Pre-blocks only contain the interactions that are explicitly - # written in the file. Some are incomplete (missing implicit defaults) - # or must be built from the "bondedtypes" rules. - for pre_block in pre_blocks.values(): - _complete_block(pre_block, bondedtypes) - - # At this point, the pre-blocks contain both the intra- and - # inter-residues information. We need to split the pre-blocks into - # blocks and links. - blocks, links = _split_blocks_and_links(pre_blocks) - - force_field.blocks.update(blocks) - force_field.links.extend(links) + parser = lark.Lark(GRAMMAR, maybe_placeholders=True) + parsed = Transformer().transform(parser.parse(lines.read())) + + visitor = RTPInterpreter(force_field=force_field) + visitor.visit(parsed) diff --git a/vermouth/molecule.py b/vermouth/molecule.py index e71086102..fb52534f4 100644 --- a/vermouth/molecule.py +++ b/vermouth/molecule.py @@ -364,7 +364,7 @@ def sort_interactions(all_interactions): return sorted(sort_keys, key=lambda k: sort_keys[k]) def __str__(self): - moltype = self.meta.get('moltype', 'molecule') + moltype = self.meta.get('moltype', self.__class__.__name__.lower()) interaction_count = OrderedDict() # Make sure atoms and edges get sorted first.