From 996b150e4d30cfeb75405c52c54a4fbfff909e24 Mon Sep 17 00:00:00 2001 From: RepoDynamicsBot <80158628+AAriam@users.noreply.github.com> Date: Fri, 11 Oct 2024 06:36:03 +0200 Subject: [PATCH] update --- docs/TODO.md | 2 + pyproject.toml | 4 +- src/licenseman/__init__.py | 3 + src/licenseman/data/__init__.py | 12 +- src/licenseman/spdx/__init__.py | 152 ++++++++ src/licenseman/spdx/license.py | 354 ++++++++++++++++++ src/licenseman/spdx/license_db.py | 25 ++ src/licenseman/spdx/license_list.py | 43 +++ .../{license.py => spdx/license_text.py} | 327 +++++++++------- 9 files changed, 776 insertions(+), 146 deletions(-) create mode 100644 docs/TODO.md create mode 100644 src/licenseman/spdx/__init__.py create mode 100644 src/licenseman/spdx/license.py create mode 100644 src/licenseman/spdx/license_db.py create mode 100644 src/licenseman/spdx/license_list.py rename src/licenseman/{license.py => spdx/license_text.py} (68%) diff --git a/docs/TODO.md b/docs/TODO.md new file mode 100644 index 0000000..c7a00e8 --- /dev/null +++ b/docs/TODO.md @@ -0,0 +1,2 @@ +- Notify SPDX about mismatches and missing info in license data (build db and run verification) +- Notify PyPA trove-classifiers about LicenseMan (https://github.com/pypa/trove-classifiers/issues/17) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 16f2e75..59b548b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,13 +17,15 @@ namespaces = true # ----------------------------------------- Project Metadata ------------------------------------- # [project] -version = "0.0.0.dev1" +version = "0.0.0.dev2" name = "LicenseMan" requires-python = ">=3.10" dependencies = [ "LoggerMan == 0.0.0.dev49", "PyLinks", "PkgData", + "PySerials", "MDit == 0.0.0.dev20", "ExceptionMan == 0.0.0.dev20", + "platformdirs >= 4.3, < 5", ] diff --git a/src/licenseman/__init__.py b/src/licenseman/__init__.py index e69de29..b1246a1 100644 --- a/src/licenseman/__init__.py +++ b/src/licenseman/__init__.py @@ -0,0 +1,3 
@@ +from loggerman import logger + +from licenseman import spdx diff --git a/src/licenseman/data/__init__.py b/src/licenseman/data/__init__.py index 3291611..7c429bd 100644 --- a/src/licenseman/data/__init__.py +++ b/src/licenseman/data/__init__.py @@ -5,11 +5,12 @@ from pathlib import Path as _Path import pkgdata as _pkgdata +import pyserials as _ps -__all__ = ["get"] +__all__ = ["get_filepath"] -def get(relative_path: str) -> _Path: +def get_filepath(relative_path: str) -> _Path: """Get the absolute path to a package data file. Parameters @@ -27,3 +28,10 @@ def get(relative_path: str) -> _Path: path_absolute=filepath, ) return filepath + + +def spdx_to_trove_mapping() -> dict[str, str]: + """Get the SPDX to Trove classifier mapping.""" + rel_path = "spdx/trove_classifiers.yaml" + abs_path = get_filepath(rel_path) + return _ps.read.yaml_from_file(abs_path) diff --git a/src/licenseman/spdx/__init__.py b/src/licenseman/spdx/__init__.py new file mode 100644 index 0000000..0096651 --- /dev/null +++ b/src/licenseman/spdx/__init__.py @@ -0,0 +1,152 @@ +from pathlib import Path as _Path +import json as _json +import platformdirs as _platdir +import pylinks as _pl +from pylinks.exception.api import WebAPIError as _WebAPIError + +from licenseman.spdx.license_db import SPDXLicenseDB +from licenseman.spdx.license_list import SPDXLicenseList +from licenseman.spdx.license import SPDXLicense +from licenseman import logger + + +URL_TEMPLATE_LICENSE_XML = "https://raw.githubusercontent.com/spdx/license-list-data/refs/heads/main/license-list-XML/{}.xml" +URL_TEMPLATE_LICENSE_JSON = "https://raw.githubusercontent.com/spdx/license-list-data/refs/heads/main/json/details/{}.json" +URL_LICENSE_LIST = "https://spdx.org/licenses/licenses.json" + + +def license_db( + path: str | _Path | None = _platdir.site_cache_path( + appauthor="RepoDynamics", + appname="LicenseMan", + ) / "SPDX_DB", + force_update: bool = False, + verify_updates: bool = True, +) -> SPDXLicenseDB: + db_path = 
_Path(path) + db_license_path = db_path / "licenses" + license_list_ = _get_global_license_list() + license_ids = license_list_.license_ids + if force_update or not db_path.is_dir(): + missing_ids = license_ids + intro = "Force update is enabled" if force_update else f"SPDX license database not found at {db_path}" + logger.log( + "info" if force_update else "notice", + "SPDX License Database Load", + f"{intro}; downloading all latest SPDX license data." + ) + else: + missing_ids = [] + for license_id in license_ids: + if not (db_license_path / f"{license_id}.json").is_file(): + missing_ids.append(license_id) + if not missing_ids: + logger.success( + "SPDX License Database Load", + f"Loaded database from {db_path}; all {len(license_ids)} license files found." + ) + return SPDXLicenseDB(license_list_, db_path) + num_missing = len(missing_ids) + num_available = len(license_ids) - num_missing + logger.log( + "notice", + "SPDX License Database Load", + f"Loaded database from {db_path}; " + f"found {num_missing} missing license files (available: {num_available})." + ) + db_license_path.mkdir(parents=True, exist_ok=True) + for missing_id in missing_ids: + output_path = db_license_path / f"{missing_id}.json" + license_data = license(missing_id, verify=verify_updates) + with open(output_path, "w") as f: + _json.dump(license_data.raw_data, f) + logger.success( + "SPDX License Database Update", + f"Downloaded '{missing_id}' to 'file://{output_path}'.", + ) + return SPDXLicenseDB(license_list_, db_path) + + +def license_list() -> SPDXLicenseList: + """Get the latest version of the [SPDX license list](https://spdx.org/licenses/) from SPDX website.""" + data = _pl.http.request(URL_LICENSE_LIST, response_type="json") + return SPDXLicenseList(data) + + +def license(license_id: str, verify: bool = True) -> SPDXLicense: + """Get an SPDX license. + + Parameters + ---------- + license_id + SPDX license ID, e.g., 'MIT', 'GPL-2.0-or-later'. 
+ """ + data = license_json(license_id) + data["xml"] = license_xml(license_id) + license_list_ = _get_global_license_list() + for list_entry_key, list_entry_val in license_list_[license_id].items(): + # 'detailsUrl', 'reference', 'referenceNumber' are not present in JSON data + if list_entry_key not in data: + data[list_entry_key] = list_entry_val + logger.info( + "SPDX JSON License Load", + f"Added missing '{list_entry_key}' entry to '{license_id}' JSON data from license list." + ) + elif data[list_entry_key] != list_entry_val: + logger.warning( + "SPDX JSON License Load", + f"Mismatched '{list_entry_key}' entry in '{license_id}' JSON data.", + "JSON content:", + logger.pretty(data[list_entry_key]), + "License list content:", + logger.pretty(list_entry_val), + ) + return SPDXLicense(data, verify=verify) + + +def license_xml(license_id: str) -> str: + """Get an SPDX license definition in XML format from SPDX + [license-list-data](https://github.com/spdx/license-list-data) repository. + + Parameters + ---------- + license_id + SPDX license ID, e.g., 'MIT', 'GPL-2.0-or-later'. + """ + try: + xml_str = _pl.http.request( + URL_TEMPLATE_LICENSE_XML.format(license_id), + response_type="str" + ) + except _WebAPIError as e: + raise Exception(f"Error downloading license XML for ID '{license_id}") from e + return xml_str + + +def license_json(license_id: str) -> dict: + """Get an SPDX license definition in XML format from SPDX + [license-list-data](https://github.com/spdx/license-list-data) repository. + + Parameters + ---------- + license_id + SPDX license ID, e.g., 'MIT', 'GPL-2.0-or-later'. 
+ """ + try: + json_data = _pl.http.request( + URL_TEMPLATE_LICENSE_JSON.format(license_id), + response_type="json" + ) + except _WebAPIError as e: + raise Exception(f"Error downloading license JSON for ID '{license_id}") from e + return json_data + + +def _get_global_license_list() -> SPDXLicenseList: + global _LICENSE_LIST + if _LICENSE_LIST is None: + _LICENSE_LIST = license_list() + return _LICENSE_LIST + + +_LICENSE_LIST: SPDXLicenseList | None = None \ No newline at end of file diff --git a/src/licenseman/spdx/license.py b/src/licenseman/spdx/license.py new file mode 100644 index 0000000..36ef04a --- /dev/null +++ b/src/licenseman/spdx/license.py @@ -0,0 +1,354 @@ +from __future__ import annotations as _annotations + +from typing import TYPE_CHECKING as _TYPE_CHECKING +import datetime as _dt +from xml.etree import ElementTree as _ElementTree +from dataclasses import dataclass as _dataclass + +from licenseman import logger as _logger +from licenseman.spdx.license_text import SPDXLicenseTextPlain + +if _TYPE_CHECKING: + from typing import Literal, Any + + +@_dataclass +class SPDXLicenseCrossRef: + """SPDX License cross reference.""" + url: str + order: int + timestamp: _dt.datetime + match: str + valid: bool + live: bool + wayback: bool + + +class SPDXLicense: + """SPDX License definition. + + Parameters + ---------- + xml + SPDX license XML content as a string. 
+ + References + ---------- + - [SPDX Docs](https://github.com/spdx/license-list-XML/blob/main/DOCS/README.md) + - [SPDX Docs - XML Fields](https://github.com/spdx/license-list-XML/blob/main/DOCS/xml-fields.md) + - [XML Schema](https://github.com/spdx/license-list-XML/blob/main/schema/ListedLicense.xsd) + - [GitHub Repository](https://github.com/spdx/license-list-XML) + """ + + def __init__(self,data: dict, verify: bool = True): + try: + root = _ElementTree.fromstring(data["xml"]) + except _ElementTree.ParseError as e: + raise Exception(f"Error parsing license XML content.") from e + self._ns: dict = {'': 'http://www.spdx.org/license'} + self._xml: _ElementTree.Element = root.find('license', self._ns) + self._data: dict = data + if verify: + self._check_integrity() + return + + def _check_integrity(self): + + def log(key_json: str, missing_in: Literal["xml", "json"], data: Any, key_xml: str | None = None): + if key_xml is None: + key_xml = key_json + if missing_in == "xml": + missing_source = "XML" + existing_source = "JSON" + missing_key = key_xml + existing_key = key_json + else: + missing_source = "JSON" + existing_source = "XML" + missing_key = key_json + existing_key = key_xml + _logger.notice( + log_title, + f"The value of '{missing_key}' is not defined in the {missing_source} data. " + f"Using the {existing_source} data value of '{existing_key}':", + _logger.pretty(data) + ) + return + + def osi_approved(): + key = "isOsiApproved" + xml_raw = self._xml.attrib.get(key) + if xml_raw == "true": + xml = True + elif xml_raw == "false": + xml = False + else: + if xml_raw is not None: + raise Exception(f"Invalid value for '{key}' in XML data: {xml_raw}") + xml = None + json = self.osi_approved + if json != xml: + if xml is None: + log(key, "xml", json) + return + if json is None: + log(key, "json", xml) + self._data[key] = xml + return + raise Exception( + "OSI approved mismatch between XML and JSON data. 
" + f"XML: {xml}, JSON: {self.osi_approved}" + ) + return + + def deprecated_version(): + key_xml = "licenseVersionDeprecated" + key_json = "deprecatedVersion" + xml = self._xml.attrib.get(key_xml) + json = self._data.get(key_json) + if json != xml: + if xml is None: + log(key_json=key_json, key_xml=key_xml, missing_in="xml", data=json) + elif json is None: + log(key_json=key_json, key_xml=key_xml, missing_in="json", data=xml) + self._data[key_json] = xml + else: + raise Exception( + "Deprecated version mismatch between XML and JSON data. " + f"XML: {xml}, JSON: {json}" + ) + return + + def cross_refs(): + xml_elem = self._xml.find('crossRefs', self._ns) + xml = sorted( + [ref.text.strip() for ref in xml_elem.findall('crossRef', self._ns)] + ) if xml_elem else [] + json = sorted([ref["url"] for ref in self._data.get("crossRef", [])]) + json_seealso = sorted(self._data.get("seeAlso", [])) + if json != json_seealso: + raise Exception( + "Cross references mismatch between 'crossRefs' and 'seeAlso' JSON data. ", + f"CrossRefs: {json}, SeeAlso: {json_seealso}" + ) + if json != xml: + if not xml: + log("crossRef", "xml", data=json) + return + raise Exception( + "Cross references mismatch between XML and JSON data. 
" + f"XML: {xml}, JSON: {json}" + ) + + log_title = f"{self.id} License Verification" + if self._data["licenseId"] != self._xml.attrib.get('licenseId'): + raise Exception("License ID mismatch between XML and JSON data.") + if self._data["name"] != self._xml.attrib.get('name'): + raise Exception("License name mismatch between XML and JSON data.") + osi_approved() + deprecated_version() + cross_refs() + return + + def generate_text(self) -> str: + return + + @property + def raw_data(self) -> dict: + """Raw license data.""" + return self._data + + @property + def id(self) -> str: + """SPDX license ID.""" + return self._data["licenseId"] + + @property + def name(self) -> str: + """Full name of the license""" + return self._data["name"] + + @property + def text_plain(self) -> str: + """Original license text in plain text format.""" + return self._data["licenseText"] + + @property + def text_html(self) -> str | None: + """Original license text in HTML format.""" + return self._data.get("licenseTextHtml") + + @property + def text_template(self) -> str | None: + """License text template.""" + return self._data.get("standardLicenseTemplate") + + @property + def text_xml(self) -> _ElementTree.Element: + return self._xml.find('text', self._ns) + + @property + def header_plain(self) -> str | None: + """Original license header in plain text format.""" + return self._data.get("standardLicenseHeader") + + @property + def header_html(self) -> str | None: + """Original license header in HTML format.""" + return self._data.get("standardLicenseHeaderHtml") + + @property + def header_template(self) -> str | None: + """License header template.""" + return self._data.get("standardLicenseHeaderTemplate") + + @property + def header_xml(self) -> _ElementTree.Element: + return self._xml.find('.//standardLicenseHeader', self._ns) + + @property + def title_text_xml(self) -> _ElementTree.Element | None: + """Title of the license as defined in the text, if any.""" + return 
self._xml.find('.//titleText', self._ns) + + @property + def copyright_notice_xml(self) -> _ElementTree.Element | None: + """Copyright notice of the license is defined in the text, if any.""" + return self._xml.find('.//copyrightText', self._ns) + + @property + def optionals_xml(self) -> list[_ElementTree.Element]: + """Optional fields in the license text, if any.""" + return self._xml.findall('.//optional', self._ns) + + @property + def alts(self) -> dict[str, dict[str, str]]: + """ + + Returns + ------- + A dictionary where keys are the alternative field names, and values are dictionaries with keys: + `text` : str + + Default value. + `match` : str + + Regular expression (RegEx) pattern to validate user input for `text`. + """ + alts = {} + for alt in self._xml.findall('.//alt', self._ns): + alts[alt.attrib['name']] = {'text': alt.text, 'match': alt.attrib['match']} + return alts + + @property + def ref_num(self) -> int: + """Reference number of the license.""" + return self._data["referenceNumber"] + + @property + def ref_url(self) -> str: + """URL to the license reference page at SPDX.org.""" + return self._data["reference"] + + @property + def json_url(self) -> str: + """URL to the license JSON data.""" + return self._data["detailsUrl"] + + @property + def cross_refs(self) -> list[SPDXLicenseCrossRef]: + """URLs to license resources, if any.""" + return [ + SPDXLicenseCrossRef( + url=ref["url"], + order=ref["order"], + timestamp=_dt.datetime.strptime(ref["timestamp"], "%Y-%m-%dT%H:%M:%SZ"), + match=ref["match"], + valid=ref["isValid"], + live=ref["isLive"], + wayback=ref["isWayBackLink"] + ) for ref in self._data.get("crossRef", []) + ] + + @property + def osi_approved(self) -> bool: + """Whether the license is OSI approved. + + Returns + ------- + A boolean, or `None` if the value is not defined in the data. + """ + return self._data["isOsiApproved"] + + @property + def fsf_libre(self) -> bool | None: + """Whether the license is FSF approved. 
+ + Returns + ------- + A boolean, or `None` if the value is not defined in the data. + """ + return self._data.get("isFsfLibre") + + @property + def deprecated(self) -> bool: + """Whether the license is deprecated. + + Returns + ------- + A boolean, or `None` if the value is not defined in the data. + """ + return self._data["isDeprecatedLicenseId"] + + @property + def version_deprecated(self) -> str | None: + """Version of the SPDX License List in which the license was deprecated, if applicable. + + Returns + ------- + Version number string, or `None` if the value is not defined in the data. + """ + return self._data.get("deprecatedVersion") + + @property + def obsoleted_by(self) -> list[dict[str, str]] | None: + """New licenses that obsolete this license, if any. + + Returns + ------- + A list of dictionaries with keys: + `id` : str + + SPDX license ID of the successor license. + `expression` : str + + [SPDX license expression](https://spdx.github.io/spdx-spec/v3.0.1/annexes/spdx-license-expressions/) + which is obsoleted by the successor license; + in most cases, this is the same as the current license's ID, unless the current license + is a complex expression, and only a part of it is obsoleted by the successor. + """ + return [ + {"id": elem.text, "expression": elem.attrib.get("expression")} + for elem in self._xml.findall('.//obsoletedBy', self._ns) + ] + + @property + def version_added(self) -> str | None: + """Version of the SPDX License List in which the license was first added. + + Returns + ------- + Version number string, or `None` if the value is not defined in the data. 
+ """ + return self._xml.attrib.get('licenseVersion') + + @property + def comments(self) -> str | None: + """Comments about the license, if any.""" + return self._data.get("licenseComments") + + @property + def notes(self) -> str | None: + """General comments about the license, if any.""" + elem = self._xml.find('notes', self._ns) + return elem.text if elem is not None else None diff --git a/src/licenseman/spdx/license_db.py b/src/licenseman/spdx/license_db.py new file mode 100644 index 0000000..bcc9c76 --- /dev/null +++ b/src/licenseman/spdx/license_db.py @@ -0,0 +1,25 @@ +from __future__ import annotations as annotations +from typing import TYPE_CHECKING as TYPE_CHECKING + +from licenseman.spdx.license import SPDXLicense as _SPDXLicense + +if TYPE_CHECKING: + from pathlib import Path + from typing import Sequence + from licenseman.spdx.license_list import SPDXLicenseList + + +class SPDXLicenseDB: + + def __init__( + self, + license_list: SPDXLicenseList, + db_path: Path, + ): + self._license_list = license_list + self._db_path = db_path + self._licenses: dict[str, _SPDXLicense] = {} + return + + def alts(self, license_ids: Sequence[str] | None = None): + license_ids = license_ids or self._license_list.license_ids \ No newline at end of file diff --git a/src/licenseman/spdx/license_list.py b/src/licenseman/spdx/license_list.py new file mode 100644 index 0000000..c308f7c --- /dev/null +++ b/src/licenseman/spdx/license_list.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import datetime as _dt + + +class SPDXLicenseList: + """SPDX license list.""" + + def __init__(self, data: dict): + self._data = data + self._map = {licence["licenseId"]: licence for licence in data["licenses"]} + return + + @property + def licenses(self) -> list[dict]: + """List of SPDX licenses.""" + return self._data["licenses"] + + @property + def license_ids(self) -> list[str]: + """List of SPDX license IDs.""" + return list(self._map.keys()) + + @property + def release_date(self) -> 
_dt.date: + """Release date of the SPDX license list.""" + return _dt.datetime.fromisoformat(self._data["releaseDate"]).date() + + @property + def version(self) -> str: + """Version of the SPDX license list.""" + return self._data["licenseListVersion"] + + def get(self, key: str) -> dict | None: + """Get a license by its key.""" + + return self._map.get(key) + + def __getitem__(self, key: str) -> dict: + return self._map[key] + + def __contains__(self, key: str) -> bool: + return key in self._map diff --git a/src/licenseman/license.py b/src/licenseman/spdx/license_text.py similarity index 68% rename from src/licenseman/license.py rename to src/licenseman/spdx/license_text.py index 0f54da2..8b2680a 100644 --- a/src/licenseman/license.py +++ b/src/licenseman/spdx/license_text.py @@ -1,107 +1,174 @@ +from __future__ import annotations as _annotations + +from typing import TYPE_CHECKING as _TYPE_CHECKING import re as _re from xml.etree import ElementTree as ET from textwrap import TextWrapper as _TextWrapper -import pylinks as pl -from pylinks.exception.api import WebAPIError as _WebAPIError +if _TYPE_CHECKING: + from typing import Any -class SPDXLicense: - def __init__( - self, - license_id: str, - ): - self._id = license_id - self._data: ET.Element = None - self._ns = {'': 'http://www.spdx.org/license'} +class SPDXLicenseText: + """ + Parses the element from an SPDX license XML and generates a plain-text version of the license. + + Parameters + ---------- + text : xml.etree.ElementTree.Element + The XML element to parse. 
+ + + References + ---------- + - official matcher: https://github.com/spdx/spdx-license-matcher + - third-party matcher: https://github.com/MikeMoore63/spdx_matcher + """ + + def __init__(self, text: ET.Element): + self._text = text + self._ns_uri = 'http://www.spdx.org/license' + self._ns = {'': self._ns_uri} + self._element_processor = { + "titleText": self.title_text, + "copyrightText": self.copyright_text, + "standardLicenseHeader": self.standard_license_header, + "list": self.list, + "p": self.p, + "br": self.br, + "item": self.item, + "bullet": self.bullet, + "optional": self.optional, + "alt": self.alt, + } + self._alt: dict = {} return - def run(self): - self._data = self.get_license() + def generate(self, alts: dict[str, str] | None = None) -> tuple[Any, Any | None]: + """Generate license full text and header. - @property - def xml_url(self) -> str: - return f'https://raw.githubusercontent.com/spdx/license-list-data/refs/heads/main/license-list-XML/{self._id}.xml' + Parameters + ---------- + alts : dict[str, int] | None, optional + A dictionary specifying choices for elements. Keys are 'name' attributes, + and values are the value to use. - def get_license(self) -> ET.Element: + Returns + ------- + The full text of the license, and the license header text, if present. """ - Takes an SPDX license ID, downloads the license XML, and returns a parsed data structure. + self._alt = alts or {} + fulltext = self.generate_full(self._text) + header = self._text.find('.//standardLicenseHeader', self._ns) + notice = (self.generate_notice(header)) if header else None + return fulltext, notice + + def process(self, element: ET.Element) -> str: + tag = self.clean_tag(element.tag) + if tag not in self._element_processor: + raise ValueError(f"Unsupported element: {tag}") + processor = self._element_processor[tag] + return processor(element) - Parameters: - license_id (str): The SPDX license ID (e.g., 'MIT', 'GPL-2.0-or-later'). 
+ def get_alt(self, element: ET.Element) -> str: + """Process an element by selecting the appropriate alternative based on `self._alt`. - Returns: - ElementTree.Element: The license element of the parsed XML tree. + Parameters + ---------- + element : xml.etree.ElementTree.Element + The element. """ - # Construct the URL for the raw XML file in the GitHub repository - try: - data = pl.http.request(self.xml_url, response_type="str") - except _WebAPIError as e: - raise Exception(f"Error downloading license XML for ID '{self._id}") from e - try: - root = ET.fromstring(data) - except ET.ParseError as e: - raise Exception(f"Error parsing XML content for ID '{self._id}'") from e - self._data = root.find('spdx:license', self._ns) - return self._data + name = element.get('name') + match = element.get('match') + if not name: + raise ValueError("Alt element must have a 'name' attribute") + if not match: + raise ValueError("Alt element must have a 'match' attribute") + text = self._alt.get(name) + if not text: + return element.text + if not _re.match(match, text): + raise ValueError(f"Alt element '{name}' does not match '{match}'") + return text + + def clean_tag(self, tag: str) -> str: + """Strip the namespace URI from XML tag. - def fullname(self): - return self._data.attrib.get('name') + Parameters + ---------- + tag + The XML tag with possible namespace. - def osi_approved(self) -> bool | None: - val = self._data.attrib.get('isOsiApproved') - return val == 'true' + Returns + ------- + The tag without namespace. 
+ """ + return tag.removeprefix(f'{{{self._ns_uri}}}') - def cross_refs(self) -> list[str]: - cross_refs = self._data.find('crossRefs', self._ns) - if not cross_refs: - return [] - return [ref.text for ref in cross_refs.findall('crossRef', self._ns)] + @staticmethod + def clean_text(text: str) -> str: + text_norm = _re.sub(r'\s+', ' ', text) + if text_norm == " ": + return "" + return text_norm - def notes(self) -> str: - return self._data.find('notes', self._ns).text + def generate_full(self, text: ET.Element): + ... - def text(self) -> ET.Element: - return self._data.find('text', self._ns) + def generate_notice(self, sandard_license_header: ET.Element): + ... - def header(self) -> ET.Element: - return self._data.find('.//standardLicenseHeader', self._ns) + def title_text(self, element: ET.Element): + ... + def copyright_text(self, element: ET.Element): + ... + def standard_license_header(self, element: ET.Element): + ... -class SPDXLicenseTextParser: - """ - Parses the element from an SPDX license XML and generates a plain-text version of the license. + def list(self, element: ET.Element): + ... + + def p(self, element: ET.Element): + ... + + def br(self, element: ET.Element): + ... + + def item(self, element: ET.Element): + ... + + def bullet(self, element: ET.Element): + ... + + def optional(self, element: ET.Element): + ... + + def alt(self, element: ET.Element): + ... + + +class SPDXLicenseTextPlain(SPDXLicenseText): + """Parses the element from an SPDX license XML and generates a plain-text version of the license. Parameters ---------- - text_element : xml.etree.ElementTree.Element + text : xml.etree.ElementTree.Element The XML element to parse. 
- """ - def __init__(self, text_element: ET.Element): - self._text = text_element - self._ns_uri = 'http://www.spdx.org/license' - self._ns = {'': self._ns_uri} - self._element_processor = { - "text": self.process_generic, - "titleText": self.process_title_text, - "copyrightText": self.process_copyright_text, - "standardLicenseHeader": self.process_generic, - "list": self.process_list, - "p": self.process_p, - "br": lambda x: "\n\n", - "item": self.process_list_item, - "bullet": self.process_generic, - "optional": self.process_optional, - "alt": self.process_alt, - } + References + ---------- + - official matcher: https://github.com/spdx/spdx-license-matcher + - third-party matcher: https://github.com/MikeMoore63/spdx_matcher + """ + def __init__(self, text: ET.Element): + super().__init__(text) self._title: str | bool = True self._copyright: str | bool = False self._include_optional: bool = True - self._alt: dict = {} self._line_len: int = 88 self._list_item_indent: int = 1 self._list_item_vertical_spacing: int = 1 @@ -112,15 +179,14 @@ def __init__(self, text_element: ET.Element): self._list_bullet_unordered_char: str = "–" self._text_wrapper: _TextWrapper | None = None self._curr_bullet_len: int = 0 - self._alts = [] return - def parse( + def generate( self, title: str | bool = True, copyright: str | bool = False, include_optional: bool = True, - alt: dict[str, str] | None = None, + alts: dict[str, str] | None = None, line_length: int = 88, list_item_indent: int = 2, list_item_vertical_spacing: int = 2, @@ -147,7 +213,7 @@ def parse( If a string, the notice is replaced with the custom string, if a notice is present. include_optional : bool, optional Whether to include elements in the output, by default True. - alt : dict[str, int] | None, optional + alts : dict[str, int] | None, optional A dictionary specifying choices for elements. Keys are 'name' attributes, and values are the value to use. 
line_length @@ -162,7 +228,6 @@ def parse( self._title = title self._copyright = copyright self._include_optional = include_optional - self._alt = alt or {} self._line_len = line_length self._text_wrapper = _TextWrapper( width=line_length, @@ -179,34 +244,23 @@ def parse( self._list_bullet_prefer_default = list_bullet_prefer_default self._list_bullet_ordered = list_bullet_ordered self._list_bullet_unordered_char = list_bullet_unordered_char - fulltext = self.process_element(self._text).strip("\n").rstrip() + "\n" - header = self._text.find('.//standardLicenseHeader', self._ns) - notice = (self.process_element(header).strip("\n").rstrip() + "\n") if header else None - return fulltext, notice - - def get_processor(self, tag: str) -> callable: - if tag not in self._element_processor: - raise ValueError(f"Unsupported element: {tag}") - return self._element_processor[tag] + fulltext, notice = super().generate(alts) + fulltext_cleaned, notice_cleaned = [ + f"{text.lstrip("\n").rstrip()}\n" if text else "" for text in (fulltext, notice) + ] + return fulltext_cleaned, notice_cleaned - def process_element( - self, - element: ET.Element, - ) -> str: - processor = self.get_processor(self.clean_tag(element.tag)) - return processor(element) + def generate_full(self, text: ET.Element): + return self.generic(text) - def process_text(self, text: str) -> str: - text_norm = _re.sub(r'\s+', ' ', text) - if text_norm == " ": - return "" - return self.wrap_text(text_norm) + def generate_notice(self, standard_license_header: ET.Element): + return self.generic(standard_license_header) - def process_generic( + def generic( self, element: ET.Element, return_list: bool = False, - ) -> str: + ) -> str | list[str]: """Recursively processes an XML element and its children. 
Parameters @@ -236,23 +290,24 @@ def process_generic( # return "\n\n".join(processed) return _re.sub(r'\n\s*\n\s*\n+', "\n\n", "".join(out)) - def process_title_text(self, element: ET.Element) -> str: + def title_text(self, element: ET.Element) -> str: """Process a element.""" if self._title is False: return "" - title = self.process_generic(element) if self._title is True else self._title - title_lines_centered = [line.strip().center(self._line_len) for line in title.splitlines() if line.strip()] + title = self.generic(element) if self._title is True else self._title + title_lines_centered = [line.strip().center(self._line_len) for line in title.splitlines() if + line.strip()] title_centered = "\n".join(title_lines_centered) return f"{title_centered}\n{'=' * self._line_len}\n\n" - def process_copyright_text(self, element: ET.Element) -> str: + def copyright_text(self, element: ET.Element) -> str: """Process a element.""" if self._copyright is False: return "" - copyright_text = self.process_generic(element) if self._copyright is True else self._copyright + copyright_text = self.generic(element) if self._copyright is True else self._copyright return f"\n\n{copyright_text.strip()}\n\n" - def process_p(self, element: ET.Element) -> str: + def p(self, element: ET.Element) -> str: """ Processes a

element and appends its text to the output. @@ -266,11 +321,13 @@ def process_p(self, element: ET.Element) -> str: out[-1].append(element.text) for child in element: tag_name = self.clean_tag(child.tag) - if tag_name != "bullet" and tag_name not in self._element_processor: + if tag_name not in self._element_processor: raise ValueError(f"Unsupported element: {tag_name}") if tag_name == "br": out.append([]) elif tag_name != "bullet": + # Sometimes the for is placed inside a

element of that item. + # Here we ignore the element since `item()` will handle it. content = self._element_processor[tag_name](child) if content: out[-1].append(content) @@ -286,7 +343,7 @@ def process_p(self, element: ET.Element) -> str: paragraphs.append(self.wrap_text(paragraph_normalized)) return f"\n\n{"\n\n".join(paragraphs)}\n\n" - def process_list(self, elem: ET.Element) -> str: + def list(self, elem: ET.Element) -> str: """ Processes a element containing elements. @@ -304,15 +361,16 @@ def process_list(self, elem: ET.Element) -> str: tag = self.clean_tag(child.tag) if tag != 'item': raise ValueError(f"List element should only contain item elements, not {tag}") - item_str = self.process_list_item(child, idx) - item_str_indented = "\n".join([f"{' ' * self._list_indent}{line}" for line in item_str.splitlines()]) + item_str = self.item(child, idx) + item_str_indented = "\n".join( + [f"{' ' * self._list_indent}{line}" for line in item_str.splitlines()]) items.append(item_str_indented) self._current_list_nesting -= 1 newlines = max(1, self._list_item_vertical_spacing) * "\n" list_str = newlines.join(items) return f"{newlines}{list_str}{newlines}" - def process_list_item(self, elem: ET.Element, idx: int) -> str: + def item(self, elem: ET.Element, idx: int) -> str: bullet_elems = elem.findall("./bullet", self._ns) + elem.findall("./p/bullet", self._ns) if len(bullet_elems) > 1: raise ValueError("Item element should contain at most one bullet element") @@ -334,7 +392,7 @@ def process_list_item(self, elem: ET.Element, idx: int) -> str: for child in elem: tag = self.clean_tag(child.tag) if tag != 'bullet': - child_str = self.process_element(child) + child_str = self.process(child) if child_str: content.append(child_str.lstrip(" ")) if child.tail: @@ -351,7 +409,7 @@ def process_list_item(self, elem: ET.Element, idx: int) -> str: self._curr_bullet_len -= len(bullet) return wrapped - def process_optional(self, element: ET.Element) -> str: + def optional(self, element: 
ET.Element) -> str: """ Processes an <optional> element based on the include_optional flag. @@ -362,9 +420,9 @@ def process_optional(self, element: ET.Element) -> str: """ if not self._include_optional: return "" - return self.process_generic(element) + return self.generic(element) - def process_alt(self, element: ET.Element) -> str: + def alt(self, element: ET.Element) -> str: """Process an <alt> element by selecting the appropriate alternative based on `self._alt`. Parameters @@ -372,19 +430,16 @@ def process_alt(self, element: ET.Element) -> str: element : xml.etree.ElementTree.Element The <alt> element. """ - name = element.get('name') - match = element.get('match') - if not name: - raise ValueError("Alt element must have a 'name' attribute") - if not match: - raise ValueError("Alt element must have a 'match' attribute") - self._alts.append({"name": name, "match": match, "text": element.text}) - text = self._alt.get(name) - if not text: - return element.text - if not _re.match(match, text): - raise ValueError(f"Alt element '{name}' does not match '{match}'") - return text + return super().get_alt(element) + + def br(self, element: ET.Element) -> str: + return "\n\n" + + def process_text(self, text: str) -> str: + text_norm = _re.sub(r'\s+', ' ', text) + if text_norm == " ": + return "" + return self.wrap_text(text_norm) def wrap_text(self, text: str) -> str: """Wrap text to the specified line length, preserving indentation. @@ -404,20 +459,6 @@ def wrap_text(self, text: str) -> str: wrapped = self._text_wrapper.fill(text) return wrapped - def clean_tag(self, tag: str) -> str: - """Strip the namespace URI from XML tag. - - Parameters - ---------- - tag - The XML tag with possible namespace. - - Returns - ------- - The tag without namespace.
- """ - return tag.removeprefix(f'{{{self._ns_uri}}}') - - -def get_all_licenses() -> dict: - return pl.http.request("https://spdx.org/licenses/licenses.json", response_type="json") \ No newline at end of file + def bullet(self, element: ET.Element) -> str: + # This will be only called when a <bullet> element is defined outside of an <item>, which is not allowed. + raise ValueError("Found a <bullet> element outside of <item> elements.")