diff --git a/CHANGELOG.md b/CHANGELOG.md index edc331f..8b22017 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,20 @@ Adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [0.14.0] - 2021-09-23 +## [0.14.3] - 2021-10-05 + +### Added + +- Some new methods to `Checksums` +- `Checksums` functions can now accept strings + +## [0.14.2] - 2021-10-05 + +### Fixed + +- Two minor bugs + +## [0.14.1] - 2021-09-23 ### Changed diff --git a/docs/requirements.txt b/docs/requirements.txt index 7efa21d..40230e4 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx >=4.0,<5.0 +sphinx >=4.2,<5.0 sphinx-rtd-theme >=1.0,<2.0 sphinx-autoapi >=1.8,<2.0 sphinx-copybutton >=0.3,<1.0 diff --git a/poetry.lock b/poetry.lock index b9cdc19..68db7b3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -974,7 +974,7 @@ xml = ["lxml"] [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "73052ed89b6284f62e32764057f8b733e8aa7480b673a538c45897e3741ee220" +content-hash = "ed189399deb8c4e1d058c9e906e78432b8fe7c3503385b0dd9db4d35f8c1e0d9" [metadata.files] alabaster = [ diff --git a/pyproject.toml b/pyproject.toml index fb837da..124c876 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "typeddfs" -version = "0.14.2" +version = "0.14.3" description = "Pandas DataFrame subclasses that enforce structure and can self-organize." 
authors = ["Douglas Myers-Turnbull"] maintainers = ["dmyersturnbull"] @@ -112,7 +112,7 @@ pytest-cov = "^3" flake8 = "^3.9" flake8-docstrings = "^1.5" flake8-bugbear = ">=21" -Sphinx = "^4.0" +Sphinx = "^4.2" sphinx-copybutton = ">=0.4, <1.0" sphinx-autoapi = "^1.5" sphinx-rtd-theme = "^1" diff --git a/tests/test_checksums.py b/tests/test_checksums.py index 118f20c..b96a91e 100644 --- a/tests/test_checksums.py +++ b/tests/test_checksums.py @@ -1,12 +1,39 @@ +from pathlib import Path + import pytest -from typeddfs.checksums import Checksums +from typeddfs.checksums import ChecksumMappingOpt, Checksums +from typeddfs.df_errors import ( + HashContradictsExistingError, + HashExistsError, + HashFilenameMissingError, +) class TestBuilders: - def test(self): + def test_get_algorithm(self): assert Checksums.get_algorithm("sha-256") == "sha256" + def test_update(self): + assert Checksums.get_updated_hashes({}, {}) == ChecksumMappingOpt({}) + original = {Path("cat"): "0x5", "ok": "0x63"} + update = {"cat": None, "other": "wads"} + expected = { + Path("cat").resolve(): None, + Path("ok").resolve(): "0x63", + Path("other").resolve(): "wads", + } + assert Checksums.get_updated_hashes(original, update) == ChecksumMappingOpt(expected) + with pytest.raises(HashExistsError): + Checksums.get_updated_hashes({"x": "5"}, {"x": "5"}, overwrite=False) + assert Checksums.get_updated_hashes({"x": "5"}, {"x": "5"}, overwrite=None) == { + Path("x").resolve(): "5" + } + with pytest.raises(HashContradictsExistingError): + Checksums.get_updated_hashes({"x": "5"}, {"x": "4"}, overwrite=None) + with pytest.raises(HashFilenameMissingError): + Checksums.get_updated_hashes({}, {"x": "4"}, missing_ok=False) + if __name__ == "__main__": pytest.main() diff --git a/tests/test_cli_help.py b/tests/test_cli_help.py index d9fa7fc..5d9d70f 100644 --- a/tests/test_cli_help.py +++ b/tests/test_cli_help.py @@ -2,7 +2,7 @@ import pytest from pandas import Period -from typeddfs import TypedDfs, FileFormat +from 
typeddfs import FileFormat, TypedDfs from typeddfs.cli_help import DfCliHelp diff --git a/typeddfs/checksums.py b/typeddfs/checksums.py index 52c662e..4b6a36b 100644 --- a/typeddfs/checksums.py +++ b/typeddfs/checksums.py @@ -4,22 +4,23 @@ from __future__ import annotations import hashlib +import os from collections import UserDict from pathlib import Path -from typing import Mapping, Optional +from typing import Callable, Iterable, Mapping, Optional, Sequence, Union import regex -from typeddfs._utils import _DEFAULT_HASH_ALG +from typeddfs._utils import _DEFAULT_HASH_ALG, PathLike from typeddfs.df_errors import ( HashAlgorithmMissingError, HashContradictsExistingError, HashDidNotValidateError, + HashExistsError, HashFileExistsError, HashFileMissingError, HashFilenameMissingError, MultipleHashFilenamesError, - HashExistsError, ) _hex_pattern = regex.compile(r"[A-Ha-h0-9]+", flags=regex.V1) @@ -31,6 +32,32 @@ def __getitem__(self, path: Path) -> str: path = path.resolve() return super().__getitem__(path) + @property + def lines(self) -> Sequence[str]: + return [self.line(p) for p in self.keys()] + + def line(self, path: PathLike) -> str: + path = Path(path) + v = self[path] + return f"{v} *{path.name}" + + +class ChecksumMappingOpt(UserDict[Path, Optional[str]]): + def __getitem__(self, path: Path) -> Optional[str]: + path = path.resolve() + return super().__getitem__(path) + + @property + def lines(self) -> Sequence[str]: + return [self.line(p) for p in self.keys() if self[p] is not None] + + def line(self, path: PathLike) -> Optional[str]: + path = Path(path) + v = self.get(path) + if v is None: + return None + return f"{v} *{path.name}" + class Checksums: @classmethod @@ -40,7 +67,7 @@ def default_algorithm(cls) -> str: @classmethod def add_any_hashes( cls, - path: Path, + path: PathLike, to_file: bool, to_dir: bool, *, @@ -59,6 +86,7 @@ def add_any_hashes( If False, never overwrite either. If None, never overwrite, but ignore if equal to any existing entries. 
""" + path = Path(path) algorithm = cls.get_algorithm(algorithm) hash_file_path = cls.get_hash_file(path, algorithm=algorithm) hash_dir_path = cls.get_hash_dir(path, algorithm=algorithm) @@ -70,12 +98,12 @@ def add_any_hashes( if to_file: cls._add_file_hash(path, hash_file_path, digest, overwrite=overwrite, dry_run=False) if to_dir: - cls._append_dir_hash(path, hash_dir_path, digest, overwrite=overwrite, dry_run=False) + cls.append_dir_hashes(hash_dir_path, {path: digest}, overwrite=overwrite) return digest @classmethod def add_file_hash( - cls, path: Path, *, algorithm: str = _DEFAULT_HASH_ALG, overwrite: bool = True + cls, path: PathLike, *, algorithm: str = _DEFAULT_HASH_ALG, overwrite: bool = True ) -> str: """ Calculates the hash of ``path`` and adds it to a file ``path+"."+alg``. @@ -88,6 +116,7 @@ def add_file_hash( Returns: The hex-encoded hash """ + path = Path(path) algorithm = cls.get_algorithm(algorithm) hash_path = path.with_suffix(path.suffix + f".{algorithm}") if hash_path.exists() and not overwrite: # check first -- save time @@ -98,7 +127,7 @@ def add_file_hash( @classmethod def append_dir_hash( - cls, path: Path, *, algorithm: str = _DEFAULT_HASH_ALG, overwrite: Optional[bool] = True + cls, path: PathLike, *, algorithm: str = _DEFAULT_HASH_ALG, overwrite: Optional[bool] = True ) -> str: """ Calculates the hash of ``path`` and appends it to a file ``dir/(dir+"."+alg)``. @@ -116,16 +145,17 @@ def append_dir_hash( path = Path(path) hash_path = cls.get_hash_dir(path, algorithm=algorithm) digest = cls.calc_hash(path) - cls._append_dir_hash(path, hash_path, digest, overwrite=overwrite, dry_run=False) + cls.append_dir_hashes(hash_path, {path: digest}, overwrite=overwrite) return digest @classmethod def verify_hash_from_hex( - cls, path: Path, expected: str, *, algorithm: str = _DEFAULT_HASH_ALG + cls, path: PathLike, expected: str, *, algorithm: str = _DEFAULT_HASH_ALG ) -> Optional[str]: """ Verifies a hash directly from a hex string. 
""" + path = Path(path) algorithm = cls.get_algorithm(algorithm) actual = cls.calc_hash(path, algorithm=algorithm) if actual != expected: @@ -139,13 +169,14 @@ def verify_hash_from_hex( @classmethod def verify_any( cls, - path: Path, + path: PathLike, file_hash: bool, dir_hash: bool, computed: Optional[str], *, algorithm: str = _DEFAULT_HASH_ALG, ) -> Optional[str]: + path = Path(path) if computed is not None: cls.verify_hash_from_hex(path, computed) if file_hash or dir_hash: @@ -159,7 +190,7 @@ def verify_any( @classmethod def verify_hash_from_file( cls, - path: Path, + path: PathLike, hash_path: Path, *, algorithm: Optional[str] = _DEFAULT_HASH_ALG, @@ -176,6 +207,7 @@ def verify_hash_from_file( algorithm: The algorithm in hashlib (ignored if ``computed`` is passed) computed: A pre-computed hex-encoded hash; if set, do not calculate from ``path`` """ + path = Path(path) if algorithm is None: algorithm = cls.guess_algorithm(hash_path) else: @@ -190,7 +222,7 @@ def verify_hash_from_file( @classmethod def verify_file_hash( cls, - path: Path, + path: PathLike, *, algorithm: str = _DEFAULT_HASH_ALG, use_filename: Optional[bool] = None, @@ -217,6 +249,7 @@ def verify_file_hash( HashFileMissingError: If the hash file does not exist HashDidNotValidateError: If the hashes are not equal """ + path = Path(path) hash_path = cls.get_hash_file(path, algorithm=algorithm) algorithm = cls.get_algorithm(algorithm) if not hash_path.exists(): @@ -228,7 +261,7 @@ def verify_file_hash( @classmethod def verify_dir_hash( - cls, path: Path, *, algorithm: str = _DEFAULT_HASH_ALG, computed: Optional[str] = None + cls, path: PathLike, *, algorithm: str = _DEFAULT_HASH_ALG, computed: Optional[str] = None ) -> str: """ Verifies a file against is corresponding per-directory hash file. @@ -249,6 +282,7 @@ def verify_dir_hash( HashVerificationError`: Superclass of ``HashDidNotValidateError`` if the filename is not listed, etc. 
""" + path = Path(path) hash_path = cls.get_hash_dir(path, algorithm=algorithm) algorithm = cls.get_algorithm(algorithm) if not hash_path.exists(): @@ -261,10 +295,11 @@ def verify_dir_hash( return computed @classmethod - def calc_hash(cls, path: Path, *, algorithm: str = _DEFAULT_HASH_ALG) -> str: + def calc_hash(cls, path: PathLike, *, algorithm: str = _DEFAULT_HASH_ALG) -> str: """ Calculates the hash of a file and returns it, hex-encoded. """ + path = Path(path) algorithm = cls.get_algorithm(algorithm) alg = getattr(hashlib, algorithm)() with path.open("rb") as f: @@ -273,15 +308,19 @@ def calc_hash(cls, path: Path, *, algorithm: str = _DEFAULT_HASH_ALG) -> str: return alg.hexdigest() @classmethod - def parse_hash_file_resolved(cls, path: Path) -> ChecksumMapping: + def parse_hash_file_resolved(cls, path: PathLike) -> ChecksumMapping: """ Reads a hash file. See Also: :meth:`parse_hash_file_generic`. + Args: + path The hash file (e.g. my-dir.sha1) + Returns: A mapping from resolved ``Path`` instances to their hex hashes """ + path = Path(path) return ChecksumMapping( { Path(path.parent, *k.split("/")).resolve(): v @@ -291,7 +330,7 @@ def parse_hash_file_resolved(cls, path: Path) -> ChecksumMapping: @classmethod def parse_hash_file_generic( - cls, path: Path, *, forbid_slash: bool = False + cls, path: PathLike, *, forbid_slash: bool = False ) -> Mapping[str, str]: """ Reads a hash file. @@ -309,6 +348,7 @@ def parse_hash_file_generic( A mapping from raw string filenames to their hex hashes. Any node called ``./`` in the path is stripped. 
+ path = Path(path) read = path.read_text(encoding="utf8").splitlines() read = [_hashsum_file_sep.split(s, 1) for s in read] # obviously this means that / can't appear in a node @@ -325,35 +365,41 @@ return kv @classmethod - def get_hash_file(cls, path: Path, *, algorithm: str = _DEFAULT_HASH_ALG) -> Path: + def get_hash_file(cls, path: PathLike, *, algorithm: str = _DEFAULT_HASH_ALG) -> Path: """ Returns the path required for the per-file hash of ``path``. Example: ``Utils.get_hash_file("my_file.txt.gz") # Path("my_file.txt.gz.sha256")`` """ + path = Path(path) algorithm = cls.get_algorithm(algorithm) return path.with_suffix(path.suffix + "." + algorithm) @classmethod - def get_hash_dir(cls, path: Path, *, algorithm: str = _DEFAULT_HASH_ALG) -> Path: + def get_hash_dir(cls, path: PathLike, *, algorithm: str = _DEFAULT_HASH_ALG) -> Path: """ Returns the path required for the per-directory hash of ``path``. Example: ``Utils.get_hash_file(Path("my_dir, my_file.txt.gz")) # Path("my_dir", "my_dir.sha256")`` """ + path = Path(path) algorithm = cls.get_algorithm(algorithm) return path.parent / (path.parent.name + "." + algorithm) @classmethod - def guess_algorithm(cls, path: Path) -> str: + def guess_algorithm(cls, path: PathLike) -> str: """ Guesses the hashlib algorithm used from a hash file. + Args: + path: The hash file (e.g. my-file.sha256) + Example: ``Utils.guess_algorithm("my_file.sha1") # "sha1"`` """ + path = Path(path) algorithm = path.suffix.lstrip(".").lower().replace("-", "") try: getattr(hashlib, algorithm) @@ -396,35 +442,147 @@ hash_path.write_text(txt, encoding="utf-8") @classmethod - def _append_dir_hash( - cls, path: Path, hash_path: Path, digest: str, *, overwrite: Optional[bool], dry_run: bool + def delete_dir_hashes( + cls, + hash_path: Path, + delete: Iterable[PathLike], + *, + missing_ok: bool = False, + ): + """ + Strips paths from a dir hash file.
+ Like :meth:`update_dir_hashes` but less flexible and only for removing paths. + """ + cls.update_dir_hashes( + hash_path, {p: None for p in delete}, missing_ok=missing_ok, overwrite=True + ) + + @classmethod + def append_dir_hashes( + cls, + hash_path: Path, + append: Mapping[PathLike, str], + *, + overwrite: Optional[bool] = False, + ): + """ + Append paths to a dir hash file. + Like :meth:`update_dir_hashes` but less flexible and only for adding paths. + """ + cls.update_dir_hashes(hash_path, append, missing_ok=True, overwrite=overwrite) + + @classmethod + def update_dir_hashes( + cls, + hash_path: PathLike, + update: Union[Callable[[Path], Optional[PathLike]], Mapping[PathLike, Optional[PathLike]]], + *, + missing_ok: bool = True, + overwrite: Optional[bool] = True, + sort: Union[bool, Callable[[Mapping[Path, str]], Mapping[Path, str]]] = False, ) -> None: - txt = f"{digest} *{path.name}" - if hash_path.exists(): - existing = cls.parse_hash_file_resolved(hash_path) - z = existing.get(path.resolve()) - if z is not None: - err = None - if overwrite is False and z == digest: - err = ( - HashExistsError, - f"Hash for {path} already exists in {hash_path} (and it matches)", - ) - if overwrite is False and z != digest: - err = ( - HashExistsError, - f"Hash for {path} already exists in {hash_path} (and it does not match)", - ) - if overwrite is None and z != digest: - err = ( - HashContradictsExistingError, - f"Hash for {path} already exists in {hash_path} but does not match", - ) - if err is not None: - raise err[0](err[1], key=str(path), original=z, new=digest) - elif not dry_run: - with hash_path.open(mode="a", encoding="utf-8") as f: - f.write(txt) + """ + Reads a dir hash file and writes back with new values. + Can add, update, and delete. + + Args: + hash_path: The path of the checksum file (e.g. "dir.sha256") + update: Values to overwrite. + May be a function or a dictionary from paths to values. 
+ If ``None`` is returned, the entry will be removed; + otherwise, updates with the returned hex hash. + missing_ok: Require that the path is already listed + overwrite: Allow overwriting an existing value. + If ``None``, only allow if the hash is the same. + sort: Apply a sorting algorithm afterward. + If ``True``, uses ``sorted``, sorting only on the keys. + """ + hash_path = Path(hash_path) + existing = cls.parse_hash_file_resolved(hash_path) if hash_path.exists() else {} + fixed = cls.get_updated_hashes( + existing, update, missing_ok=missing_ok, overwrite=overwrite, sort=sort + ) + hash_path.write_text(os.linesep.join(fixed.lines), encoding="utf8") + + @classmethod + def get_updated_hashes( + cls, + existing: Mapping[PathLike, str], + update: Union[Callable[[Path], Optional[PathLike]], Mapping[PathLike, Optional[PathLike]]], + *, + missing_ok: bool = True, + overwrite: Optional[bool] = True, + sort: Union[bool, Callable[[Sequence[Path]], Sequence[Path]]] = False, + ) -> ChecksumMappingOpt: + """ + Returns updated hashes from a dir hash file. + See :meth:`update_dir_hashes`; this just returns values instead of reading and writing. + + Returns: + A Mapping from resolved Paths; to: hex-encoded digests or ``None`` to indicate removal. + Has a method :meth:`typedfs.checksums.ChecksumMappingOpt.lines`. 
+ """ + existing = ChecksumMapping({Path(p).resolve(): h for p, h in existing.items()}) + fixed = {} + for p, v in existing.items(): + v_new = update(p) if callable(update) else update.get(p, v) + fixed[p] = cls._get_updated_hash( + path=p, + new=v_new, + existing=existing, + missing_ok=missing_ok, + overwrite=overwrite, + ) + if not callable(update): + for p, v in update.items(): + p = Path(p).resolve() + fixed[p] = cls._get_updated_hash( + path=p, + new=v, + existing=existing, + missing_ok=missing_ok, + overwrite=overwrite, + ) + if sort is True: + fixed = sorted(fixed) + elif callable(sort): + fixed = sort(fixed) + return ChecksumMappingOpt(fixed) + + @classmethod + def _get_updated_hash( + cls, + *, + path: Path, + new: Optional[str], + existing: Mapping[Path, str], + overwrite: Optional[bool], + missing_ok: bool, + ) -> Optional[str]: + path = path.resolve() + z = existing.get(path) + if z is None and not missing_ok: + raise HashFilenameMissingError(f"{path} not found ({len(existing)} are)") + if z is not None: + err = None + if overwrite is False and z == new: + err = ( + HashExistsError, + f"Hash for {path} already exists (and it matches)", + ) + if overwrite is False and z != new: + err = ( + HashExistsError, + f"Hash for {path} already exists (and it does not match)", + ) + if overwrite is None and z != new: + err = ( + HashContradictsExistingError, + f"Hash for {path} already exists but does not match", + ) + if err is not None: + raise err[0](err[1], key=str(path), original=z, new=new) + return new @classmethod def _verify_file_hash( diff --git a/typeddfs/cli_help.py b/typeddfs/cli_help.py index b68ba22..b0b8c68 100644 --- a/typeddfs/cli_help.py +++ b/typeddfs/cli_help.py @@ -12,7 +12,7 @@ from dataclasses import dataclass from typing import FrozenSet, Mapping, Sequence, Type -from typeddfs import TypedDf, MatrixDf +from typeddfs import MatrixDf, TypedDf # noinspection PyProtectedMember from typeddfs._utils import _FLEXWF_SEP, _HDF_KEY, _PICKLE_VR, 
_TOML_AOT