From 92eabb1a77967439a41bc54ae17a0f26a2259067 Mon Sep 17 00:00:00 2001
From: Alex Ioannidis
Date: Mon, 25 Nov 2024 11:06:20 +0100
Subject: [PATCH] names: support mapping affiliation IDs

* Allows providing a CSV file to the ORCiD transformer for mapping
  organization identifiers encountered in the ORCiD data to linkable
  affiliation IDs.
---
 invenio_vocabularies/config.py                 |  10 +
 .../contrib/names/datastreams.py               | 121 +++++--
 tests/contrib/names/test_names_datastreams.py  | 298 +++++++++++++-----
 tests/datastreams/test_transformers.py         |  83 +++--
 4 files changed, 382 insertions(+), 130 deletions(-)

diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py
index 90a11151..417ed8db 100644
--- a/invenio_vocabularies/config.py
+++ b/invenio_vocabularies/config.py
@@ -213,3 +213,13 @@ def is_edmo(val):
     "days": 1,
 }
 """ORCID time shift to sync. Parameters accepted are the ones passed to 'datetime.timedelta'."""
+
+VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH = None
+"""Path to the CSV file for mapping ORCiD organization IDs to affiliation IDs.
+
+The CSV file should have the following columns:
+
+- `org_scheme`: The scheme of the organization ID (e.g. `GRID`).
+- `org_id`: The organization ID in that scheme.
+- `aff_id`: The affiliation ID to map to.
+"""
diff --git a/invenio_vocabularies/contrib/names/datastreams.py b/invenio_vocabularies/contrib/names/datastreams.py
index 8299336e..267116d9 100644
--- a/invenio_vocabularies/contrib/names/datastreams.py
+++ b/invenio_vocabularies/contrib/names/datastreams.py
@@ -13,11 +13,13 @@
 import tarfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import timedelta
+from pathlib import Path
 
 import arrow
 import regex as re
 from flask import current_app
 from invenio_access.permissions import system_identity
+from werkzeug.utils import cached_property
 
 from invenio_vocabularies.contrib.names.s3client import S3OrcidClient
 
@@ -141,6 +143,52 @@ def __init__(self, *args, test_mode=True, **kwargs):
 """Regex to filter out names with punctuation, symbols, numbers and emojis."""
 
 
+class OrcidOrgToAffiliationMapper:
+    """Default ORCiD Org ID to affiliation ID mapper."""
+
+    def __init__(self, org_ids_mapping=None, org_ids_mapping_file=None):
+        """Constructor."""
+        self._org_ids_mapping = org_ids_mapping
+        self._org_ids_mapping_file = org_ids_mapping_file
+
+    @cached_property
+    def org_ids_mapping(self):
+        """Mapping of ORCiD org IDs to affiliation IDs."""
+        org_ids_mapping_file = self._org_ids_mapping_file or current_app.config.get(
+            "VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"
+        )
+        if org_ids_mapping_file:
+            org_ids_mapping_file = Path(org_ids_mapping_file)
+            # If the path is relative, prepend the instance path
+            if not org_ids_mapping_file.is_absolute():
+                org_ids_mapping_file = (
+                    Path(current_app.instance_path) / org_ids_mapping_file
+                )
+            with open(org_ids_mapping_file) as fin:
+                result = {}
+                reader = csv.reader(fin)
+
+                # Skip the header row if present; otherwise treat the first row as data
+                org_scheme, org_id, aff_id = next(reader)
+                if org_scheme.lower() != "org_scheme":
+                    result[(org_scheme, org_id)] = aff_id
+
+                for org_scheme, org_id, aff_id in reader:
+                    result[(org_scheme, org_id)] = aff_id
+
+                return result
+
+        return self._org_ids_mapping or {}
+
+    def __call__(self, org_scheme, org_id):
+        """Map an ORCiD org ID to an affiliation ID."""
+        # By default we know that ROR IDs are linkable
+        if org_scheme == "ROR":
+            return org_id.split("/")[-1]
+        # Otherwise see if we have a mapping from other schemes to an affiliation ID
+        return self.org_ids_mapping.get((org_scheme, org_id))
+
+
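For illustration, a minimal usage sketch of the mapper introduced above. The example identifiers mirror the patch's test data (GRID/grid.123456.z maps to acme_inc_id), and an active Flask application context is assumed, since the mapper also consults VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH from the app config:

from invenio_vocabularies.contrib.names.datastreams import OrcidOrgToAffiliationMapper

# In-memory mapping keyed by (org_scheme, org_id) tuples. Equivalently, a CSV file
# (optionally with an "org_scheme,org_id,aff_id" header row) can be passed via
# org_ids_mapping_file= or configured through VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH:
#   GRID,grid.123456.z,acme_inc_id
mapper = OrcidOrgToAffiliationMapper(
    org_ids_mapping={("GRID", "grid.123456.z"): "acme_inc_id"}
)

# ROR IDs are considered linkable by default: only the final URL segment is kept.
assert mapper("ROR", "https://ror.org/01ggx4157") == "01ggx4157"
# Other schemes are resolved through the mapping...
assert mapper("GRID", "grid.123456.z") == "acme_inc_id"
# ...and unmapped identifiers yield None, so no affiliation ID gets linked.
assert mapper("RINGGOLD", "54249") is None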
 class OrcidTransformer(BaseTransformer):
     """Transforms an ORCiD record into a names record."""
 
@@ -148,30 +196,19 @@ def __init__(
         self,
         *args,
         names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX,
-        affiliation_relation_schemes=None,
-        org_scheme_mappping=None,
+        org_id_to_affiliation_id_func=None,
         **kwargs,
     ) -> None:
         """Constructor."""
         self._names_exclude_regex = names_exclude_regex
-        self._affiliation_relation_schemes = affiliation_relation_schemes
-        self._org_scheme_mappping = org_scheme_mappping
+        self._org_id_to_affiliation_id_func = (
+            org_id_to_affiliation_id_func or OrcidOrgToAffiliationMapper()
+        )
         super().__init__()
 
-    @property
-    def affiliation_relation_schemes(self):
-        """Allowed affiliation schemes."""
-        return self._affiliation_relation_schemes or {"ror"}
-
-    @property
-    def org_scheme_mappping(self):
-        """Mapping of ORCiD org schemes to affiliation schemes."""
-        return self._org_scheme_mappping or {
-            "GRID": "grid",
-            "ROR": "ror",
-            "ISNI": "isni",
-            "FUNDREF": "fundref",
-        }
+    def org_id_to_affiliation_id(self, org_scheme, org_id):
+        """Convert an ORCiD org ID to a linkable affiliation ID."""
+        return self._org_id_to_affiliation_id_func(org_scheme, org_id)
 
     def apply(self, stream_entry, **kwargs):
         """Applies the transformation to the stream entry."""
@@ -212,36 +249,52 @@ def _extract_affiliations(self, record):
         employments = (
             record.get("activities-summary", {})
             .get("employments", {})
-            .get("affiliation-group")
+            .get("affiliation-group", [])
         )
+
+        # If there is a single value, the XML-to-dict conversion doesn't wrap it in a list
         if isinstance(employments, dict):
             employments = [employments]
 
+        # Remove the "employment-summary" nesting
+        employments = [
+            employment.get("employment-summary", {}) for employment in employments
+        ]
+
         history = set()
         for employment in employments:
-            terminated = employment["employment-summary"].get("end-date")
-            org = employment["employment-summary"]["organization"]
-            if org["name"] not in history and not terminated:
-                history.add(org["name"])
-                aff = {"name": org["name"]}
+            terminated = employment.get("end-date")
+            org = employment["organization"]
 
-                if org.get("disambiguated-organization"):
-                    dis_org = org["disambiguated-organization"]
-                    org_id = dis_org.get("disambiguated-organization-identifier")
-                    org_scheme = dis_org.get("disambiguation-source")
-                    aff_scheme = self.org_scheme_mappping.get(org_scheme)
+            if terminated or org["name"] in history:
+                continue
 
-                    if org_id and aff_scheme in self.affiliation_relation_schemes:
-                        if aff_scheme == "ror":
-                            org_id = org_id.split("/")[-1]
+            history.add(org["name"])
+            aff = {"name": org["name"]}
 
-                        aff["id"] = org_id
+            # Extract the org ID to link to the affiliation vocabulary
+            aff_id = self._extract_affiliation_id(org)
+            if aff_id:
+                aff["id"] = aff_id
 
-                result.append(aff)
+            result.append(aff)
         except Exception:
             pass
         return result
 
+    def _extract_affiliation_id(self, org):
+        """Extract the affiliation ID from an ORCiD organization."""
+        dis_org = org.get("disambiguated-organization")
+        if not dis_org:
+            return
+
+        aff_id = None
+        org_id = dis_org.get("disambiguated-organization-identifier")
+        org_scheme = dis_org.get("disambiguation-source")
+        if org_id and org_scheme:
+            aff_id = self.org_id_to_affiliation_id(org_scheme, org_id)
+        return aff_id
+
 
 class NamesServiceWriter(ServiceWriter):
     """Names service writer."""
diff --git a/tests/contrib/names/test_names_datastreams.py b/tests/contrib/names/test_names_datastreams.py
index 46c89cc6..3abd82ee 100644
--- a/tests/contrib/names/test_names_datastreams.py
+++ 
b/tests/contrib/names/test_names_datastreams.py @@ -8,6 +8,8 @@ """Names data streams tests.""" +import os +import tempfile from copy import deepcopy from unittest.mock import patch @@ -23,6 +25,7 @@ ) from invenio_vocabularies.datastreams import StreamEntry from invenio_vocabularies.datastreams.errors import TransformerError, WriterError +from invenio_vocabularies.datastreams.transformers import XMLTransformer @pytest.fixture(scope="function") @@ -43,17 +46,8 @@ def name_full_data(): } -@pytest.fixture(scope="module") -def expected_from_xml(): - return { - "id": "0000-0001-8135-3489", - "given_name": "Lars Holm", - "family_name": "Nielsen", - "identifiers": [{"scheme": "orcid", "identifier": "0000-0001-8135-3489"}], - "affiliations": [{"id": "01ggx4157", "name": "CERN"}], - } - - +# NOTE: This is a simplified version of the original XML data from ORCiD. Sections like +# works, funding, educations, etc. and attributes have been removed. XML_ENTRY_DATA = b""" @@ -66,7 +60,6 @@ def expected_from_xml(): Lars Holm Nielsen - @@ -89,6 +82,21 @@ def expected_from_xml(): + + 2024 + 01 + + + ACME Inc. + + grid.123456.z + GRID + + + + + + 2007 08 @@ -100,22 +108,53 @@ def expected_from_xml(): European Southern Observatory - 54249 - RINGGOLD + grid.424907.c + GRID - + """ - -@pytest.fixture(scope="function") -def bytes_xml_data(): - # simplified version of an XML file of the ORCiD dump - return XML_ENTRY_DATA +XML_ENTRY_DATA_SINGLE_EMPLOYMENT = b""" + + + https://orcid.org/0000-0001-8135-3489 + 0000-0001-8135-3489 + orcid.org + + + + Lars Holm + Nielsen + + + + + + + + 2012 + 03 + 16 + + + CERN + + https://ror.org/01ggx4157 + + ROR + + + + + + + +""" NAMES_TEST = { @@ -128,77 +167,192 @@ def bytes_xml_data(): @pytest.fixture(scope="function") -def dict_xml_entry(): - return StreamEntry( +def orcid_data(): + base = { + "orcid-identifier": { + "uri": "https://orcid.org/0000-0001-8135-3489", + "path": "0000-0001-8135-3489", + "host": "orcid.org", + }, + "person": { + "name": {"given-names": "Lars Holm", "family-name": "Nielsen"}, + }, + } + + employments = [ { - "orcid-identifier": { - "uri": "https://orcid.org/0000-0001-8135-3489", - "path": "0000-0001-8135-3489", - "host": "orcid.org", - }, - "person": { - "name": { - "given-names": "Lars Holm", - "family-name": "Nielsen", - "@visibility": "public", - "@path": "0000-0001-8135-3489", + "employment-summary": { + "organization": { + "name": "CERN", + "disambiguated-organization": { + "disambiguated-organization-identifier": "https://ror.org/01ggx4157", + "disambiguation-source": "ROR", + }, }, - "external-identifiers": { - "@path": "/0000-0001-8135-3489/external-identifiers" + "start-date": {"year": "2012", "month": "03", "day": "16"}, + } + }, + { + "employment-summary": { + "organization": { + "name": "ACME Inc.", + "disambiguated-organization": { + "disambiguated-organization-identifier": "grid.123456.z", + "disambiguation-source": "GRID", + }, }, - "@path": "/0000-0001-8135-3489/person", - }, - "activities-summary": { - "employments": { - "affiliation-group": { - "employment-summary": { - "organization": { - "name": "CERN", - "disambiguated-organization": { - "disambiguated-organization-identifier": "https://ror.org/01ggx4157", - "disambiguation-source": "ROR", - }, - } - } + "start-date": {"year": "2024", "month": "01"}, + } + }, + { + "employment-summary": { + "organization": { + "name": "European Southern Observatory", + "disambiguated-organization": { + "disambiguated-organization-identifier": "grid.424907.c", + "disambiguation-source": "GRID", 
}, - "@path": "/0000-0001-8135-3489/employments", }, - "@path": "/0000-0001-8135-3489/activities", + "start-date": {"year": "2007", "month": "08"}, + "end-date": {"year": "2012", "month": "03"}, + }, + }, + ] + + return { + "xml": { + "multi_employment": XML_ENTRY_DATA, + "single_employment": XML_ENTRY_DATA_SINGLE_EMPLOYMENT, + }, + "json": { + "multi_employment": { + **base, + "activities-summary": { + "employments": {"affiliation-group": employments}, + }, + }, + "single_employment": { + **base, + "activities-summary": { + # NOTE: Because the XML data has only one employment, the + # transformer will not create a list of employments, but instead a + # single employment object. + "employments": {"affiliation-group": employments[0]} + }, }, - "@path": "/0000-0001-8135-3489", - } - ) + }, + } + + +@pytest.fixture(scope="module") +def expected_from_xml(): + base = { + "id": "0000-0001-8135-3489", + "given_name": "Lars Holm", + "family_name": "Nielsen", + "identifiers": [{"scheme": "orcid", "identifier": "0000-0001-8135-3489"}], + } + + return { + "multi_employment": { + **base, + "affiliations": [ + {"id": "01ggx4157", "name": "CERN"}, + # NOTE: GRID identifiers do not result in linked affiliations + {"name": "ACME Inc."}, + # NOTE: terminated employments are not included in the affiliations + ], + }, + "single_employment": { + **base, + "affiliations": [{"id": "01ggx4157", "name": "CERN"}], + }, + } + + +@pytest.fixture(scope="function") +def org_ids_mapping_file_config(app): + """ORCiD organization IDs mappings CSV file.""" + fout = tempfile.NamedTemporaryFile(mode="w", delete=False) + fout.write('"GRID","grid.123456.z","acme_inc_id"\n') + fout.close() + old_config = app.config.get("VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH") + app.config["VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"] = fout.name + yield -def test_orcid_transformer(dict_xml_entry, expected_from_xml): + app.config["VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"] = old_config + os.unlink(fout.name) + + +def test_orcid_xml_transform(orcid_data): + """Test XML transformer with ORCiD record. + + NOTE: this might look somewhat "redundant", since we're again testing if the + XMLTransformer works as expected. However, this is a more specific test + that demonstrates how the transformer behaves with more complex XML data, that + can have e.g. nested elements that can be arrays or objects. 
+ """ + xml_data = orcid_data["xml"] + json_data = orcid_data["json"] + + for key, data in xml_data.items(): + transformer = XMLTransformer(root_element="record") + transform_result = transformer.apply(StreamEntry(data)).entry + assert transform_result == json_data[key] + + +def test_orcid_transformer(app, orcid_data, expected_from_xml): + """Test ORCiD transformer data.""" transformer = OrcidTransformer() - assert expected_from_xml == transformer.apply(dict_xml_entry).entry + input_data = orcid_data["json"] + + for key, data in input_data.items(): + assert expected_from_xml[key] == transformer.apply(StreamEntry(data)).entry, key + + +def test_orcid_transformer_org_ids_mapping_from_file( + app, + orcid_data, + expected_from_xml, + org_ids_mapping_file_config, +): + """Test ORCiD transformer data with org IDs mapping file.""" + transformer = OrcidTransformer() + input_data = orcid_data["json"]["multi_employment"] + expected = deepcopy(expected_from_xml["multi_employment"]) + expected["affiliations"] = [ + {"id": "01ggx4157", "name": "CERN"}, + {"id": "acme_inc_id", "name": "ACME Inc."}, + ] + + assert expected == transformer.apply(StreamEntry(input_data)).entry @pytest.mark.parametrize("name,is_valid_name", NAMES_TEST.items()) -def test_orcid_transformer_name_filtering(dict_xml_entry, name, is_valid_name): +def test_orcid_transformer_name_filtering(orcid_data, name, is_valid_name): transformer = OrcidTransformer() - val = deepcopy(dict_xml_entry) + val = deepcopy(orcid_data["json"]["multi_employment"]) if is_valid_name: - assert transformer.apply(val).entry + assert transformer.apply(StreamEntry(val)).entry else: with pytest.raises(TransformerError): - val.entry["person"]["name"]["given-names"] = name - val.entry["person"]["name"]["family-name"] = "" - transformer.apply(val) + val["person"]["name"]["given-names"] = name + val["person"]["name"]["family-name"] = "" + transformer.apply(StreamEntry(val)) with pytest.raises(TransformerError): - val.entry["person"]["name"]["given-names"] = "" - val.entry["person"]["name"]["family-name"] = name - transformer.apply(val) + val["person"]["name"]["given-names"] = "" + val["person"]["name"]["family-name"] = name + transformer.apply(StreamEntry(val)) @pytest.mark.parametrize("name", NAMES_TEST.keys()) -def test_orcid_transformer_different_names_no_regex(dict_xml_entry, name): +def test_orcid_transformer_different_names_no_regex(orcid_data, name): transformer = OrcidTransformer(names_exclude_regex=None) - val = deepcopy(dict_xml_entry) - val.entry["person"]["name"]["given-names"] = name - val.entry["person"]["name"]["family-name"] = "" - assert transformer.apply(val).entry + val = deepcopy(orcid_data["json"]["multi_employment"]) + val["person"]["name"]["given-names"] = name + val["person"]["name"]["family-name"] = "" + assert transformer.apply(StreamEntry(val)).entry class MockResponse: @@ -207,14 +361,14 @@ class MockResponse: @patch("requests.get", side_effect=lambda url, headers: MockResponse()) -def test_orcid_http_reader(_, bytes_xml_data): +def test_orcid_http_reader(_): reader = OrcidHTTPReader(id="0000-0001-8135-3489") results = [] for entry in reader.read(): results.append(entry) assert len(results) == 1 - assert bytes_xml_data == results[0] + assert XML_ENTRY_DATA == results[0] def test_names_service_writer_create(app, db, search_clear, name_full_data): diff --git a/tests/datastreams/test_transformers.py b/tests/datastreams/test_transformers.py index 8c5b1943..dd7628d7 100644 --- a/tests/datastreams/test_transformers.py +++ 
b/tests/datastreams/test_transformers.py @@ -18,26 +18,51 @@ @pytest.fixture(scope="module") def expected_from_xml(): return { - "record": { - "field_one": "value", - "multi_field": {"some": "value", "another": "value too"}, - } + "top_level_field": "top-level single value", + "top_level_object_field": { + "some": "value", + "another": "value too", + "nested_array_field": { + "@array_attr": "value", + "array_element_object": [ + { + "@obj_attr": "first", + "element_foo": "value1", + "element_bar": "value1", + }, + { + "@obj_attr": "second", + "element_foo": "value2", + "element_bar": "value2", + "element_qux": "value2", + }, + ], + }, + }, } def test_xml_transformer(expected_from_xml): bytes_xml_entry = StreamEntry( - bytes( - '\n' - "\n" - " value\n" - " \n" - " value\n" - " value too\n" - " \n" - "\n", - encoding="raw_unicode_escape", - ) + b""" + + top-level single value + + value + value too + + + value1 + value1 + + + value2 + value2 + value2 + + + + """ ) transformer = XMLTransformer() @@ -46,15 +71,25 @@ def test_xml_transformer(expected_from_xml): def test_bad_xml_transformer(): bytes_xml_entry = StreamEntry( - bytes( - '\n' - "value\n" - "\n" - " value\n" - " value too\n" - "\n", - encoding="raw_unicode_escape", - ) + b""" + + top-level single value + + value + value too + + + value1 + value1 + + + value2 + value2 + value2 + + + + """ ) transformer = XMLTransformer(root_element="field_two")
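A brief aside on the XML-to-dict behaviour that these tests and the transformer's list normalization rely on: a single child element is converted to a plain dict, while repeated children become a list. The sketch below demonstrates the idea with the third-party xmltodict package, used here only as a stand-in for the package's own XMLTransformer; the element names are made up for the example.

import xmltodict

single = xmltodict.parse(b"<record><group><name>CERN</name></group></record>")
multi = xmltodict.parse(
    b"<record>"
    b"<group><name>CERN</name></group>"
    b"<group><name>ESO</name></group>"
    b"</record>"
)

# One <group> child yields a dict; two <group> children yield a list of dicts. This is
# why OrcidTransformer._extract_affiliations wraps a lone dict into a list before iterating.
assert single["record"]["group"] == {"name": "CERN"}
assert multi["record"]["group"] == [{"name": "CERN"}, {"name": "ESO"}]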