projwfc.x: parse from XML instead of parent calc (#747)

The current `ProjwfcParser` uses several input and output nodes from the parent calculation. This increased the complexity of the tests for this parser, and made running `opengrid.x` in between the `pw.x` and `projwfc.x` runs impossible without adding these input and output nodes to the calculation job of `opengrid.x`. Here we switch to parsing the XML instead of relying on the parent calculation. The `data-file-schema.xml` of the parent calculation is retrieved and parsed, providing the required information for the subsequent parsing of the `projwfc.x` output. All the parsing tests are updated to include the XML output file and remove the input/output links for the parent calculation. Note that the XML file is added to the temporary retrieve list because, although it is required for parsing, it is already stored in the repository of an ancestor calculation. The `convert_qe_to_kpoints` function is added to convert the k-points data in the XML to a `KpointsData` node.
aiidateam · Oct 8, 2021 · 0874d95 · 0874d95
1 parent e22fcba
commit 0874d95
Show file tree

Hide file tree

Showing 93 changed files with 9,401 additions and 3,036 deletions.
diff --git a/aiida_quantumespresso/calculations/namelists.py b/aiida_quantumespresso/calculations/namelists.py
@@ -31,11 +31,12 @@ class NamelistsCalculation(CalcJob):
     # parent_folder is of type RemoteData or FolderData
     _OUTPUT_SUBFOLDER = './out/'
     _PREFIX = 'aiida'
-    _internal_retrieve_list = []
     _default_namelists = ['INPUTPP']
     _blocked_keywords = []  # a list of tuples with key and value fixed
 
+    _internal_retrieve_list = []
     _retrieve_singlefile_list = []
+    _retrieve_temporary_list = []
 
     _DEFAULT_INPUT_FILE = 'aiida.in'
     _DEFAULT_OUTPUT_FILE = 'aiida.out'
@@ -218,10 +219,10 @@ def prepare_for_submission(self, folder):
         # Retrieve by default the output file and the xml file
         calcinfo.retrieve_list = []
         calcinfo.retrieve_list.append(self.inputs.metadata.options.output_filename)
-        settings_retrieve_list = settings.pop('ADDITIONAL_RETRIEVE_LIST', [])
-        calcinfo.retrieve_list += settings_retrieve_list
+        calcinfo.retrieve_list += settings.pop('ADDITIONAL_RETRIEVE_LIST', [])
         calcinfo.retrieve_list += self._internal_retrieve_list
 
+        calcinfo.retrieve_temporary_list = self._retrieve_temporary_list
         calcinfo.retrieve_singlefile_list = self._retrieve_singlefile_list
 
         # We might still have parser options in the settings dictionary: pop them.

diff --git a/aiida_quantumespresso/calculations/projwfc.py b/aiida_quantumespresso/calculations/projwfc.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 """`CalcJob` implementation for the projwfc.x code of Quantum ESPRESSO."""
+from pathlib import Path
+
 from aiida.orm import RemoteData, FolderData, Dict, XyData
 from aiida_quantumespresso.calculations.namelists import NamelistsCalculation
 
@@ -25,7 +27,17 @@ class ProjwfcCalculation(NamelistsCalculation):
         ('PROJWFC', 'plotboxes', False),
     ]
     _default_parser = 'quantumespresso.projwfc'
-    _internal_retrieve_list = [NamelistsCalculation._PREFIX + '.pdos*']
+
+    xml_path = Path(NamelistsCalculation._default_parent_output_folder
+                    ).joinpath(f'{NamelistsCalculation._PREFIX}.save', 'data-file-schema.xml')
+    _internal_retrieve_list = [
+        NamelistsCalculation._PREFIX + '.pdos*',
+    ]
+    # The XML file is added to the temporary retrieve list since it is required for parsing, but already in the
+    # repository of an ancestor calculation.
+    _retrieve_temporary_list = [
+        xml_path.as_posix(),
+    ]
 
     @classmethod
     def define(cls, spec):
@@ -45,10 +57,20 @@ def define(cls, spec):
         spec.output('projections', valid_type=ProjectionData, required=False)
         spec.output('bands', valid_type=BandsData, required=False)
         spec.default_output_node = 'output_parameters'
+        spec.exit_code(301, 'ERROR_NO_RETRIEVED_TEMPORARY_FOLDER',
+            message='The retrieved temporary folder could not be accessed.')
+        spec.exit_code(303, 'ERROR_OUTPUT_XML_MISSING',
+            message='The retrieved folder did not contain the required XML file.')
         spec.exit_code(310, 'ERROR_OUTPUT_STDOUT_READ',
             message='The stdout output file could not be read.')
         spec.exit_code(312, 'ERROR_OUTPUT_STDOUT_INCOMPLETE',
             message='The stdout output file was incomplete probably because the calculation got interrupted.')
+        spec.exit_code(320, 'ERROR_OUTPUT_XML_READ',
+            message='The XML output file could not be read.')
+        spec.exit_code(321, 'ERROR_OUTPUT_XML_PARSE',
+            message='The XML output file could not be parsed.')
+        spec.exit_code(322, 'ERROR_OUTPUT_XML_FORMAT',
+            message='The XML output file has an unsupported format.')
         spec.exit_code(330, 'ERROR_READING_PDOSTOT_FILE',
             message='The pdos_tot file could not be read from the retrieved folder.')
         spec.exit_code(340, 'ERROR_PARSING_PROJECTIONS',

diff --git a/aiida_quantumespresso/calculations/pw.py b/aiida_quantumespresso/calculations/pw.py
@@ -89,7 +89,7 @@ def define(cls, spec):
         spec.exit_code(302, 'ERROR_OUTPUT_STDOUT_MISSING',
             message='The retrieved folder did not contain the required stdout output file.')
         spec.exit_code(303, 'ERROR_OUTPUT_XML_MISSING',
-            message='The retrieved folder did not contain the required required XML file.')
+            message='The retrieved folder did not contain the required XML file.')
         spec.exit_code(304, 'ERROR_OUTPUT_XML_MULTIPLE',
             message='The retrieved folder contained multiple XML files.')
         spec.exit_code(305, 'ERROR_OUTPUT_FILES',

diff --git a/aiida_quantumespresso/parsers/parse_raw/base.py b/aiida_quantumespresso/parsers/parse_raw/base.py
@@ -156,3 +156,30 @@ def convert_qe2aiida_structure(output_dict, input_structure=None):
         s.reset_sites_positions(new_pos)
 
     return s
+
+
+def convert_qe_to_kpoints(xml_dict, structure):
+    """Build the output kpoints from the raw parsed data.
+
+    :param parsed_parameters: the raw parsed data
+    :return: a `KpointsData` or None
+    """
+    from aiida.plugins import DataFactory
+
+    KpointsData = DataFactory('array.kpoints')
+
+    k_points_list = xml_dict.get('k_points', None)
+    k_points_units = xml_dict.get('k_points_units', None)
+    k_points_weights_list = xml_dict.get('k_points_weights', None)
+
+    if k_points_list is None or k_points_weights_list is None:
+        return None
+
+    if k_points_units != '1 / angstrom':
+        raise ValueError('k-points are not expressed in reciprocal cartesian coordinates')
+
+    kpoints = KpointsData()
+    kpoints.set_cell_from_structure(structure)
+    kpoints.set_kpoints(k_points_list, cartesian=True, weights=k_points_weights_list)
+
+    return kpoints
diff --git a/aiida_quantumespresso/parsers/projwfc.py b/aiida_quantumespresso/parsers/projwfc.py
@@ -1,15 +1,19 @@
 # -*- coding: utf-8 -*-
+from pathlib import Path
 import re
 import fnmatch
 
 import numpy as np
 
-from aiida.common import LinkType
-from aiida.orm import Dict, ProjectionData, BandsData, XyData, CalcJobNode
+from aiida.orm import Dict, ProjectionData, BandsData, XyData
 from aiida.plugins import OrbitalFactory
 
 from aiida_quantumespresso.parsers import QEOutputParsingError
-from aiida_quantumespresso.parsers.parse_raw.base import parse_output_base
+from aiida_quantumespresso.parsers.parse_raw.base import (
+    parse_output_base, convert_qe2aiida_structure, convert_qe_to_kpoints
+)
+from aiida_quantumespresso.utils.mapping import get_logging_container
+
 from .base import Parser
 
 
@@ -168,16 +172,11 @@ def spin_dependent_subparser(out_info_dict):
         raise QEOutputParsingError('the standard out file does not comply with the official documentation.')
 
     bands_data = BandsData()
-    # Attempts to retrieve the kpoints from the parent calc
-    parent_calc = out_info_dict['parent_calc']
-    try:
-        parent_kpoints = parent_calc.get_incoming(link_label_filter='kpoints').one().node
-    except ValueError:
-        raise QEOutputParsingError('The parent had no input kpoints! Cannot parse from this!')
+    kpoints = od['kpoints']
     try:
-        if len(od['k_vect']) != len(parent_kpoints.get_kpoints()):
+        if len(od['k_vect']) != len(kpoints.get_kpoints()):
             raise AttributeError
-        bands_data.set_kpointsdata(parent_kpoints)
+        bands_data.set_kpointsdata(kpoints)
     except AttributeError:
         bands_data.set_kpoints(od['k_vect'].astype(float))
 
@@ -284,8 +283,12 @@ def parse(self, **kwargs):
 
         Retrieves projwfc output, and some basic information from the out_file, such as warnings and wall_time
         """
-        # Check that the retrieved folder is there
         retrieved = self.retrieved
+        # Get the temporary retrieved folder
+        try:
+            retrieved_temporary_folder = kwargs['retrieved_temporary_folder']
+        except KeyError:
+            return self.exit(self.exit_codes.ERROR_NO_RETRIEVED_TEMPORARY_FOLDER)
 
         # Read standard out
         try:
@@ -309,6 +312,24 @@ def parse(self, **kwargs):
         self.emit_logs(logs)
         self.out('output_parameters', Dict(dict=parsed_data))
 
+        # Parse the XML to obtain the `structure`, `kpoints` and spin-related settings from the parent calculation
+        self.exit_code_xml = None
+        parsed_xml, logs_xml = self._parse_xml(retrieved_temporary_folder)
+        self.emit_logs(logs_xml)
+
+        if self.exit_code_xml:
+            return self.exit(self.exit_code_xml)
+
+        # we create a dictionary that progressively accumulates more info
+        out_info_dict = {}
+
+        out_info_dict['structure'] = convert_qe2aiida_structure(parsed_xml['structure'])
+        out_info_dict['kpoints'] = convert_qe_to_kpoints(parsed_xml, out_info_dict['structure'])
+        out_info_dict['nspin'] = parsed_xml.get('number_of_spin_components')
+        out_info_dict['collinear'] = not parsed_xml.get('non_colinear_calculation')
+        out_info_dict['spinorbit'] = parsed_xml.get('spin_orbit_calculation')
+        out_info_dict['spin'] = out_info_dict['nspin'] == 2
+
         # check and read pdos_tot file
         out_filenames = retrieved.list_object_names()
         try:
@@ -329,8 +350,6 @@ def parse(self, **kwargs):
                 pdos_atm_array_dict[name] = np.atleast_2d(np.genfromtxt(pdosatm_file))
 
         # finding the bands and projections
-        # we create a dictionary the progressively accumulates more info
-        out_info_dict = {}
         out_info_dict['out_file'] = out_file
         out_info_dict['energy'] = energy
         out_info_dict['pdos_atm_array_dict'] = pdos_atm_array_dict
@@ -347,6 +366,37 @@ def parse(self, **kwargs):
         Dos_out.set_y(dos, 'Dos', 'states/eV')
         self.out('Dos', Dos_out)
 
+    def _parse_xml(self, retrieved_temporary_folder):
+        """Parse the XML file.
+
+        The XML must be parsed in order to obtain the required information for the orbital parsing.
+        """
+        from .parse_xml.exceptions import XMLParseError, XMLUnsupportedFormatError
+        from .parse_xml.pw.parse import parse_xml
+
+        logs = get_logging_container()
+        parsed_xml = {}
+
+        xml_filepath = Path(retrieved_temporary_folder) / self.node.process_class.xml_path.name
+
+        if not xml_filepath.exists():
+            self.exit_code_xml = self.exit_codes.ERROR_OUTPUT_XML_MISSING
+            return parsed_xml, logs
+
+        try:
+            with xml_filepath.open('r') as handle:
+                parsed_xml, logs = parse_xml(handle, None)
+        except IOError:
+            self.exit_code_xml = self.exit_codes.ERROR_OUTPUT_XML_READ
+        except XMLParseError:
+            self.exit_code_xml = self.exit_codes.ERROR_OUTPUT_XML_PARSE
+        except XMLUnsupportedFormatError:
+            self.exit_code_xml = self.exit_codes.ERROR_OUTPUT_XML_FORMAT
+        except Exception:
+            self.exit_code_xml = self.exit_codes.ERROR_UNEXPECTED_PARSER_EXCEPTION
+
+        return parsed_xml, logs
+
     def _parse_bands_and_projections(self, out_info_dict):
         """Function that parses the standard output into bands and projection data.
 
@@ -382,57 +432,21 @@ def _parse_bands_and_projections(self, out_info_dict):
         # calculates the number of bands
         out_info_dict['num_bands'] = len(out_info_dict['psi_lines']) // len(out_info_dict['k_lines'])
 
-        # Uses the parent input parameters, and checks if the parent used
-        # spin calculations. Try to replace with a query, if possible.
-        try:
-            parent_calc = (
-                self.node.inputs.parent_folder.get_incoming(node_class=CalcJobNode,
-                                                            link_type=LinkType.CREATE).one().node
-            )
-        except ValueError as e:
-            raise QEOutputParsingError(f'Could not get parent calculation of input folder: {e}')
-        out_info_dict['parent_calc'] = parent_calc
-        try:
-            parent_param = parent_calc.get_outgoing(link_label_filter='output_parameters').one().node
-        except ValueError:
-            raise QEOutputParsingError('The parent had no output_parameters! Cannot parse from this!')
-        try:
-            structure = parent_calc.get_incoming(link_label_filter='structure').one().node
-        except ValueError:
-            raise QEOutputParsingError('The parent had no input structure! Cannot parse from this!')
-        try:
-            nspin = parent_param.get_dict()['number_of_spin_components']
-            if nspin != 1:
-                spin = True
-            else:
-                spin = False
-            out_info_dict['spinorbit'] = parent_param.get_dict().get('spin_orbit_calculation', False)
-            out_info_dict['collinear'] = not parent_param.get_dict().get('non_colinear_calculation', False)
-            if not out_info_dict['collinear']:
-                # Sanity check
-                if nspin != 4:
-                    raise QEOutputParsingError('The calculation is non-collinear, but nspin is not set to 4!')
-                spin = False
-        except KeyError:
-            spin = False
-            out_info_dict['spinorbit'] = False
-            out_info_dict['collinear'] = True
-        out_info_dict['spin'] = spin
-
         # changes k-numbers to match spin
         # because if spin is on, k points double for up and down
         out_info_dict['k_states'] = len(out_info_dict['k_lines'])
-        if spin:
+        if out_info_dict['spin']:
             if out_info_dict['k_states'] % 2 != 0:
                 raise QEOutputParsingError('Internal formatting error regarding spin')
             out_info_dict['k_states'] = out_info_dict['k_states'] // 2
 
-        #   adds in the k-vector for each kpoint
+        # adds in the k-vector for each kpoint
         k_vect = [out_file[out_info_dict['k_lines'][i]].split()[2:] for i in range(out_info_dict['k_states'])]
         out_info_dict['k_vect'] = np.array(k_vect)
-        out_info_dict['structure'] = structure
         out_info_dict['orbitals'] = find_orbitals_from_statelines(out_info_dict)
 
+        spin = out_info_dict['spin']
+
         if spin:
             # I had to guess what the ordering of the spin is, because
             # the projwfc.x documentation doesn't say, but looking at the

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -265,7 +265,10 @@ def _generate_calc_job_node(
         if retrieve_temporary:
             dirpath, filenames = retrieve_temporary
             for filename in filenames:
-                shutil.copy(os.path.join(filepath_folder, filename), os.path.join(dirpath, filename))
+                try:
+                    shutil.copy(os.path.join(filepath_folder, filename), os.path.join(dirpath, filename))
+                except FileNotFoundError:
+                    pass  # To test the absence of files in the retrieve_temporary folder
 
         if filepath_folder:
             retrieved = orm.FolderData()
@@ -274,7 +277,10 @@ def _generate_calc_job_node(
             # Remove files that are supposed to be only present in the retrieved temporary folder
             if retrieve_temporary:
                 for filename in filenames:
-                    retrieved.delete_object(filename)
+                    try:
+                        retrieved.delete_object(filename)
+                    except OSError:
+                        pass  # To test the absence of files in the retrieve_temporary folder
 
             retrieved.add_incoming(node, link_type=LinkType.CREATE, link_label='retrieved')
             retrieved.store()