diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 411221c6..9fb10fa2 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -6,21 +6,24 @@ on: jobs: publish: + name: Upload release to PyPI runs-on: ubuntu-latest + environment: + name: production + url: https://pypi.org/project/pganonymize/ + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - run: | - python setup.py sdist bdist_wheel - twine upload dist/* + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel + - name: Build + run: python setup.py sdist bdist_wheel + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 001aebab..72d367a7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,49 +3,57 @@ name: Test on: [push] jobs: - linting: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox - - name: Lint - run: tox -e flake8 tests: - needs: linting runs-on: ubuntu-latest strategy: matrix: - python-version: ['2.7', '3.6', '3.7', '3.8', '3.9', '3.10'] - env: - PYTHON: ${{ matrix.python-version }} + image: + - 'python:2.7-buster' + - 'python:3.6-bullseye' + - 'python:3.7-bookworm' + - 'python:3.8-bookworm' + - 'python:3.9-bookworm' + - 'python:3.10-bookworm' + - 'python:3.11-bookworm' + - 'python:3.12-bookworm' + container: + image: ${{ matrix.image }} steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - id: setup-python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies + - uses: actions/checkout@v4 + - name: Set environment variables run: | - python -m pip install --upgrade pip - pip install --use-pep517 tox "coverage<5" - - name: Run tests + echo "PYTHON=$(echo '${{ matrix.image }}' | sed -r 's/^python:([0-9]+)\.([0-9]+).*$/\1.\2/g')" >> $GITHUB_ENV + echo "TOXFACTOR=$(echo '${{ matrix.image }}' | sed -r 's/^python:([0-9]+)\.([0-9]+).*$/py\1\2/g')" >> $GITHUB_ENV + - name: Install psycopg2 requirements run: | - export TOXENV=$(echo "py${{ matrix.python-version }}" | sed 's/\.//g') - tox -- -p no:warnings + apt update + apt-get -y install python-dev + if: matrix.image == 'python:2.7-buster' + - name: Install psycopg2 requirements + run: | + apt update + apt-get -y install python3-dev + if: matrix.image != 'python:2.7-buster' + - name: Install importlib-metadata for older Python versions + run: pip install "importlib-metadata<3" + if: >- + matrix.image == 'python:2.7-buster' || + matrix.image == 'python:3.6-bullseye' || + matrix.image == 'python:3.7-bookworm' + - name: Install test utilities + run: pip install "tox<4" tox-factor "coverage<5" + - name: Lint with flake8 + run: tox -e flake8 + if: matrix.image == 'python:3.12-bookworm' + - name: Test via tox + run: tox -- -p no:warnings - name: Generate coverage report - run: coverage html + run: coverage xml if: ${{ success() }} - - name: Upload coverage data - uses: actions/upload-artifact@v3 + - name: Upload coverage report + uses: codecov/codecov-action@v4 with: - name: coverage - path: htmlcov - if-no-files-found: ignore + files: coverage.xml + flags: unittests + token: ${{ secrets.CODECOV_TOKEN }} if: ${{ success() }} diff --git a/.gitignore b/.gitignore index 5783e32d..7332cbaa 100644 --- a/.gitignore +++ b/.gitignore @@ -55,10 +55,11 @@ coverage.xml # Sphinx documentation docs/_build/ +docs/_api # PyBuilder target/ # IDEs .idea/ -.vscode/ \ No newline at end of file +.vscode/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..c5eadeac --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,118 @@ +# Changelog + +## Development + +## 0.12.0 (2024-07-18) + +* [#64 Fix typos](https://github.com/rheinwerk-verlag/pganonymize/pull/64) ([kianmeng](https://github.com/kianmeng)) +* [#62 Allow to configure parallel option](https://github.com/rheinwerk-verlag/pganonymize/pull/62) ([fblackburn1](https://github.com/fblackburn1)) +* [#58 Support pg dump password](https://github.com/rheinwerk-verlag/pganonymize/pull/58) ([fblackburn1](https://github.com/fblackburn1)) +* [#53 Python 2.7 tests are failing](https://github.com/rheinwerk-verlag/pganonymize/issues/53) +* [#56 Add Trusted Publisher Management workflow](https://github.com/rheinwerk-verlag/pganonymize/issues/56) + +## 0.11.0 (2024-02-29) + +* [#52 Add update_json provider](https://github.com/rheinwerk-verlag/pganonymize/pull/52) ([bobslee](https://github.com/bobslee)) + +## 0.10.0 (2022-11-29) + +* [#49 Configure psycopg2 to support UUID objects](https://github.com/rheinwerk-verlag/pganonymize/pull/49) +* [#48 Add support for localized "Faker" data](https://github.com/rheinwerk-verlag/pganonymize/pull/48) + +## 0.9.0 (2022-11-23) + +* [#46 Broken Python 2.7 compatibility](https://github.com/rheinwerk-verlag/pganonymize/pull/46) +* [#45 Add partial masked provider](https://github.com/rheinwerk-verlag/pganonymize/pull/45) ([Tilley](https://github.com/Tilley/)) +* [#44 Pass kwargs through to faker functions from schema](https://github.com/rheinwerk-verlag/pganonymize/pull/44)([Tilley](https://github.com/Tilley>)) + +## 0.8.0 (2022-03-15) + +* [#39 Renamed project to "pganonymize"](https://github.com/rheinwerk-verlag/pganonymize/issues/39) +* [#38 Allow environment variables in schema definition](https://github.com/rheinwerk-verlag/pganonymize/pull/38) ([nurikk](https://github.com/nurikk)) + +## 0.7.0 (2021-11-30) + +* [#34 Subprocess "run" being used on Python2.7](https://github.com/rheinwerk-verlag/pganonymize/issues/34) +* [#35 parmap no longer supports Python 2.7](https://github.com/rheinwerk-verlag/pganonymize/issues/35) + * Dropped Python 3.5 support + * Pinned libraries Python 2.7 +* [#32 Fixed pg_dump arguments](https://github.com/rheinwerk-verlag/pganonymize/pull/32) ([korsar182](https://github.com/korsar182)) +* Simplified provider registration (no metaclass usage anymore) + +## 0.6.1 (2021-07-13) + +* Added missing dependencies for the `setup.py` + +## 0.6.0 (2021-07-13) + +* [#28 Add json support](https://github.com/rheinwerk-verlag/pganonymize/pull/25) ([nurikk](https://github.com/nurikk)) +* [#27 Better anonymisation](https://github.com/rheinwerk-verlag/pganonymize/pull/25) ([nurikk](https://github.com/nurikk)) +* [#25 Remove column specification for `cursor.copy_from` call](https://github.com/rheinwerk-verlag/pganonymize/pull/25) ([nurikk](https://github.com/nurikk)) + +## 0.5.0 (2021-06-30) + +* [#22 Fix table and column name quotes in `cursor.copy_from` call](https://github.com/rheinwerk-verlag/pganonymize/pull/22) ([nurikk](https://github.com/nurikk)) +* [#23 Allow uniq faker](https://github.com/rheinwerk-verlag/pganonymize/pull/23) ([nurikk](https://github.com/nurikk)) + +## 0.4.1 (2021-05-27) + +* [#19 Make chunk size in the table definition dynamic](https://github.com/rheinwerk-verlag/pganonymize/pull/19) ([halilkaya](https://github.com/halilkaya)) + +## 0.4.0 (2021-05-05) + +* [#18 Specify (SQL WHERE) search_condition, to filter the table for rows to be anonymized](https://github.com/rheinwerk-verlag/pganonymize/pull/18) (`bobslee `_) +* [#17 Fix anonymizing error if there is a JSONB column in a table](https://github.com/rheinwerk-verlag/pganonymize/pull/17) ([koptelovav](https://github.com/koptelovav)) + +## 0.3.3 (2021-04-16) + +* [#16 Preserve column and table cases during the copy process](https://github.com/rheinwerk-verlag/pganonymize/issues/16) + +## 0.3.2 (2021-01-25) + +* [#15 Fix for exclude bug](https://github.com/rheinwerk-verlag/pganonymize/pull/15) ([abhinavvaidya90](https://github.com/abhinavvaidya90)) + +## 0.3.1 (2020-12-04) + +* [#13 Fixed a syntax error if no truncated tables are defined](https://github.com/rheinwerk-verlag/pganonymize/pull/13) ([ray-man](https://github.com/ray-man)) + +## 0.3.0 (2020-02-11) + +* Use [`python-poetry`](https://github.com/python-poetry/poetry) for requirements management +* Added commandline argument to list all available providers (#4) +* Added commandline argument to create a dump file (#5) +* Execute table truncation in one statement to avoid foreign key constraint errors (thanks to [W1ldPo1nter](https://github.com/W1ldPo1nter)) + +## 0.2.4 (2020-01-03) + +* Fixed several issues with the usage of ``dict.keys`` and Python 3 + +## 0.2.3 (2020-01-02) + +* Fixed the wrong cStringIO import for Python 3 +* Removed Travis-CI file in favor of the Github actions + +## 0.2.2 (2020-01-02) + +* Hide the progressbar completely if verbose is set to ``False`` +* Restructured the requirement files and added flake8 to Travis CI + +## 0.2.1 (2019-12-20) + +* Added field based, regular expression excludes (to skip data under certain conditions). + Currently only regular expressions are supported and the exclusion affects the whole row, + not just one single column. + +## 0.2.0 (2019-12-20) + +* Added provider classes +* Added new providers: + * choice - returns a random list element + * mask - replaces the original value with a static sign + +## 0.1.1 (2019-12-18) + +Changed setup.py + +## 0.1.0 (2019-12-16) + +Initial release of the prototype diff --git a/CHANGELOG.rst b/CHANGELOG.rst deleted file mode 100644 index 7f73dc57..00000000 --- a/CHANGELOG.rst +++ /dev/null @@ -1,133 +0,0 @@ -Changelog -========= - -Development ------------ - -0.11.0 (2024-02-29) -------------------- - -* `#52 `_: Add update_json provider (`bobslee `_) - -0.10.0 (2022-11-29) -------------------- - -* `#49 `_: Configure psycopg2 to support UUID objects -* `#48 `_: Add support for localized "Faker" data - -0.9.0 (2022-11-23) ------------------- - -* `#46 `_: Broken Python 2.7 compatibility -* `#45 `_: Add partial masked provider (`Tilley `_) -* `#44 `_: Pass kwargs through to faker functions from schema (`Tilley `_) - -0.8.0 (2022-03-15) ------------------- - -* `#39 `_: Renamed project to "pganonymize" -* `#38 `_: Allow environment variables in schema definition (`nurikk `_) - -0.7.0 (2021-11-30) ------------------- - -* `#34 `_: Subprocess "run" being used on Python2.7 -* `#35 `_: parmap no longer supports Python 2.7 - * Dropped Python 3.5 support - * Pinned libraries Python 2.7 -* `#32 `_: Fixed pg_dump arguments (`korsar182 `_) -* Simplified provider registration (no metaclass usage anymore) - -0.6.1 (2021-07-13) ------------------- - -* Added missing dependencies for the `setup.py` - -0.6.0 (2021-07-13) ------------------- - -* `#28 `_: Add json support (`nurikk `_) -* `#27 `_: Better anonymisation (`nurikk `_) -* `#25 `_: Remove column specification for `cursor.copy_from` call (`nurikk `_) - -0.5.0 (2021-06-30) ------------------- - -* `#22 `_: Fix table and column name quotes in `cursor.copy_from` call (`nurikk `_) -* `#23 `_: Allow uniq faker (`nurikk `_) - -0.4.1 (2021-05-27) ------------------- - -* `#19 `_: Make chunk size in the table definition dynamic (`halilkaya `_) - -0.4.0 (2021-05-05) ------------------- - -* `#18 `_: Specify (SQL WHERE) search_condition, to filter the table for rows to be anonymized (`bobslee `_) -* `#17 `_: Fix anonymizing error if there is a JSONB column in a table (`koptelovav `_) - -0.3.3 (2021-04-16) ------------------- - -* `#16 `_: Preserve column and table cases during the copy process - -0.3.2 (2021-01-25) ------------------- - -* `#15 `_: Fix for exclude bug (`abhinavvaidya90 `_) - -0.3.1 (2020-12-04) ------------------- - -* `#13 `_: Fixed a syntax error if no truncated tables are defined (`ray-man `_) - -0.3.0 (2020-02-11) ------------------- - -* Use `python-poetry `_ for requirements management -* Added commandline argument to list all available providers (#4) -* Added commandline argument to create a dump file (#5) -* Execute table truncation in one statement to avoid foreign key constraint errors (thanks to `W1ldPo1nter `_) - -0.2.4 (2020-01-03) ------------------- - -* Fixed several issues with the usage of ``dict.keys`` and Python 3 - -0.2.3 (2020-01-02) ------------------- - -* Fixed the wrong cStringIO import for Python 3 -* Removed Travis-CI file in favor of the Github actions - -0.2.2 (2020-01-02) ------------------- - -* Hide the progressbar completely if verbose is set to ``False`` -* Restructured the requirement files and added flake8 to Travis CI - -0.2.1 (2019-12-20) ------------------- - -* Added field based, regular expression excludes (to skip data under certain conditions). - Currently only regular expressions are supported and the exclusion affects the whole row, - not just one single column. - -0.2.0 (2019-12-20) ------------------- - -* Added provider classes -* Added new providers: - * choice - returns a random list element - * mask - replaces the original value with a static sign - -0.1.1 (2019-12-18) ------------------- - -Changed setup.py - -0.1.0 (2019-12-16) ------------------- - -Initial release of the prototype diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..97b11e41 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,42 @@ +# Contributing to PostgreSQL Anonymizer + +First of all: thanks for your interest in this project and taking the time to contribute. + +The following document is a small set of guidelines for contributing to this project. They are guidelines and no rules. + +## Reporting bugs + +If you have found a bug, please check the project's +[issue](https://github.com/rheinwerk-verlag/pganonymize/issues) page first and feel free to create a +[new issue](https://github.com/rheinwerk-verlag/pganonymize/issues/new), if no one else has reported it yet. + +## Making changes + +Create a fork if you want to make changes or clone the repo if you want a readonly access to the current development +version: + +```bash +$ git clone git@github.com:rheinwerk-verlag/pganonymize.git +$ cd pganonymize +``` + +For the development use a virtualenv or install the requirements directly: + +```bash +$ sudo pip install -r requirements.txt +``` + +## Coding style + +We have created an [EditorConfig](https://editorconfig.org/) file for this project that should be usable for most IDEs. +Otherwise please make sure to adhere to the specifications from the config file. + +## Creating a pull request + +Before creating a pull request make sure to check: + +* existing docstrings have been updated +* new code has valid docstrings +* whether existing [tests](https://github.com/rheinwerk-verlag/pganonymize/tree/development/tests) have to be fixed +* new tests have to be written first +* the documentation (in particular the [Sphinx documentation](https://github.com/rheinwerk-verlag/pganonymize/tree/development/docs)) has to be modified diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst deleted file mode 100644 index ed67cb9b..00000000 --- a/CONTRIBUTING.rst +++ /dev/null @@ -1,52 +0,0 @@ -Contributing to PostgreSQL Anonymizer -===================================== - -First of all: thanks for your interest in this project and taking the time to contribute. - -The following document is a small set of guidelines for contributing to this project. They are guidelines and no rules. - -Reporting bugs ---------------- - -If you have found a bug, please check the project's `issue`_ page first and feel free to create a `new issue`_, if no -one else has reported it yet. - -Making changes --------------- - -Create a fork if you want to make changes or clone the repo if you want a readonly access to the current development -version: - -.. code-block:: bash - - $ git clone git@github.com:rheinwerk-verlag/postgresql-anonymizer.git - $ cd postgresql-anonymizer - -For the development use a virtualenv or install the requirements directly: - -.. code-block:: bash - - $ sudo pip install -r requirements.txt - -Coding style ------------- - -We have created an `EditorConfig`_ file for this project that should be usable for most IDEs. Otherwise please make -sure to adhere to the specifications from the config file. - -Creating a pull request ------------------------ - -Before creating a pull request make sure to check: - -* existing docstrings have been updated -* new code has valid docstrings -* whether existing `tests`_ have to be fixed -* new tests have to be written first -* the documentation (in particular the `Sphinx documentation`_) has to be modified - -.. _issue: https://github.com/rheinwerk-verlag/postgresql-anonymizer/issues -.. _new issue: https://github.com/rheinwerk-verlag/postgresql-anonymizer/issues/new -.. _EditorConfig: https://editorconfig.org/ -.. _tests: https://github.com/rheinwerk-verlag/postgresql-anonymizer/tree/development/tests -.. _Sphinx documentation: https://github.com/rheinwerk-verlag/postgresql-anonymizer/tree/development/docs diff --git a/Dockerfile b/Dockerfile index 40ffebaa..e78fe78f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,9 @@ -FROM python:3.8.1-slim +FROM python:3.12.2-slim LABEL maintainer="webteam@rheinwerk-verlag.de" RUN apt-get update -y \ - && apt-get upgrade -y \ + && apt-get upgrade -y \ && apt-get install -y libpq-dev python3-pip \ && pip install -U pip \ && pip install pganonymize psycopg2-binary \ diff --git a/LICENSE.rst b/LICENSE.rst index ad18bfc8..a37e625d 100644 --- a/LICENSE.rst +++ b/LICENSE.rst @@ -3,7 +3,7 @@ License The MIT License -Copyright (c) 2019-2021, Rheinwerk Verlag GmbH +Copyright (c) 2019-2024, Rheinwerk Verlag GmbH Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index eebb1a36..f7fbf8de 100644 --- a/Makefile +++ b/Makefile @@ -41,24 +41,61 @@ clean-pyc: ## remove Python file artifacts clean-test: ## remove test and coverage artifacts rm -fr .tox/ + rm -f .coverage + rm -fr reports/ + +test: ## run tests quickly with the default Python + python setup.py test + +test-all: ## run tests on every Python version with tox + tox + +pylint: ## run style checks and static analysis with pylint + @-mkdir -p reports/ + @-pylint $(PYTHON_PACKAGE) -r n --msg-template="{path}:{line}: [{msg_id}({symbol}), {obj}] {msg}" > reports/pylint.txt + @echo "See reports/pylint.txt" + @-pylint $(PYTHON_PACKAGE) flake8: ## run style checks and static analysis with flake8 - @flake8 + @-mkdir -p reports/ + flake8 $(PYTHON_PACKAGE) $(TESTS_PACKAGE) --format='%(path)s:%(row)d: [%(code)s(%(code)s), ] %(text)s' --output-file=reports/flake8.txt --tee + +docstrings: ## check docstring presence and style conventions with pydocstyle + pydocstyle $(PYTHON_PACKAGE) + +lint: flake8 docstrings pylint + +coverage: ## check code coverage quickly with the default Python + py.test --cov-report html:reports/htmlcov --cov-report xml:reports/coverage.xml + @echo "See reports/htmlcov/index.html" + +metrics: ## print code metrics with radon + radon raw -s $(PYTHON_PACKAGE) $(TEST_PACKAGE) + radon cc -s $(PYTHON_PACKAGE) $(TEST_PACKAGE) + radon mi -s $(PYTHON_PACKAGE) $(TEST_PACKAGE) + +docs: ## generate Sphinx HTML documentation, including API docs + @if python -c 'import sys; sys.exit(sys.version_info[0]<3)'; then \ + rm -rf docs/_api; \ + sphinx-apidoc --no-toc -o docs/_api $(PYTHON_PACKAGE) "**/tests" "**/migrations" "**/south_migrations"; \ + $(MAKE) -C docs clean; \ + $(MAKE) -C docs html; \ + echo "See docs/_build/html/index.html"; \ + else \ + echo "Please build the docs using Python 3."; \ + fi + +docs-open: + $(BROWSER) docs/_build/html/index.html + +docs-all: docs docs-open release: clean ## package and upload a release - python setup.py sdist upload - python setup.py bdist_wheel upload + python setup.py release upload dist: clean ## builds source and wheel package - python setup.py sdist - python setup.py bdist_wheel + python setup.py release ls -l dist install: clean ## install the package to the active Python's site-packages python setup.py install - -test: - @pytest - -test-all: ## run tests on every Python version with tox - @tox diff --git a/README.rst b/README.rst index d06acd2e..c2427da4 100644 --- a/README.rst +++ b/README.rst @@ -8,9 +8,10 @@ anonymization. The tool requires a direct PostgreSQL connection to perform the a .. class:: no-web no-pdf - |python| |license| |pypi| |downloads| |build| |health| + |python| |license| |pypi| |downloads| |build| |codecov| |health| -.. image:: docs/_static/demo.gif +.. figure:: https://raw.githubusercontent.com/rheinwerk-verlag/pganonymize/main/docs/_static/demo.gif + :width: 100% .. contents:: @@ -84,8 +85,11 @@ Usage --port PORT Port of the database --dry-run Don't commit changes made on the database --dump-file DUMP_FILE - Create a database dump file with the given name + Create a database dump file with the given name + --dump-options DUMP_OPTIONS + Options to pass to the pg_dump command --init-sql INIT_SQL SQL to run before starting anonymization + --parallel Data anonymization is done in parallel Despite the database connection values, you will have to define a YAML schema file, that includes all anonymization rules for that database. Take a look at the `schema documentation`_ or the @@ -134,6 +138,23 @@ Example call: --dump-file=/tmp/dump.gz \ -v +So that the password for dumping does not have to be entered manually, it can also be entered as an environment var +``PGPASSWORD``: + +.. code-block:: + + $ PGPASSWORD=password pganonymize --schema=myschema.yml \ + --dbname=test_database \ + --user=username \ + --password=mysecret \ + --host=db.host.example.com \ + --dump-file=/tmp/dump.gz \ + -v + +.. warning:: + + Currently only the ``dump-file`` operation supports environment variables. + Docker ~~~~~~ @@ -161,7 +182,7 @@ After that you can pass a schema file to the container, using Docker volumes, an .. _uuid4: https://www.postgresql.org/docs/current/datatype-uuid.html .. _documentation: https://pganonymize.readthedocs.io/en/latest/ -.. _schema documentation: https://python-postgresql-anonymizer.readthedocs.io/en/latest/schema.html +.. _schema documentation: https://pganonymize.readthedocs.io/en/latest/schema.html .. _YAML sample schema: https://github.com/rheinwerk-verlag/pganonymize/blob/master/sample_schema.yml .. |python| image:: https://img.shields.io/pypi/pyversions/pganonymize @@ -177,10 +198,13 @@ After that you can pass a schema file to the container, using Docker volumes, an :target: https://pepy.tech/project/pganonymize :alt: Download count -.. |build| image:: https://github.com/rheinwerk-verlag/postgresql-anonymizer/workflows/Test/badge.svg +.. |build| image:: https://github.com/rheinwerk-verlag/pganonymize/actions/workflows/test.yml/badge.svg :target: https://github.com/rheinwerk-verlag/pganonymize/actions +.. |codecov| image:: https://codecov.io/gh/rheinwerk-verlag/pganonymize/branch/main/graph/badge.svg + :target: https://codecov.io/gh/rheinwerk-verlag/pganonymize + .. |health| image:: https://snyk.io/advisor/python/pganonymize/badge.svg - :target: https://snyk.io/advisor/python/pganonymize - :alt: pganonymize + :target: https://snyk.io/advisor/python/pganonymize + :alt: pganonymize diff --git a/docs/api.rst b/docs/api.rst index 4ac86b45..22f0faa6 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -4,4 +4,4 @@ API .. toctree:: :maxdepth: 4 - pganonymize + _api/pganonymize diff --git a/docs/changelog.md b/docs/changelog.md new file mode 120000 index 00000000..04c99a55 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1 @@ +../CHANGELOG.md \ No newline at end of file diff --git a/docs/changelog.rst b/docs/changelog.rst deleted file mode 100644 index 565b0521..00000000 --- a/docs/changelog.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../CHANGELOG.rst diff --git a/docs/conf.py b/docs/conf.py index cf7f31dc..4e1da000 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -40,7 +40,7 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode', 'myst_parser'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -48,7 +48,7 @@ #source_parsers = {} # The suffix of source filenames. -source_suffix = ['.rst'] +source_suffix = ['.rst', '.md'] # The encoding of source files. #source_encoding = 'utf-8-sig' @@ -58,7 +58,7 @@ # General information about the project. project = u'pganonymize' -copyright = u'2019, Rheinwerk Verlag GmbH, Henning Kage' +copyright = u'2019-2024, Rheinwerk Verlag GmbH, Henning Kage' # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout diff --git a/docs/pganonymize.rst b/docs/pganonymize.rst deleted file mode 100644 index f460a561..00000000 --- a/docs/pganonymize.rst +++ /dev/null @@ -1,62 +0,0 @@ -pganonymize package -==================== - -Submodules ----------- - -pganonymize.cli module ------------------------ - -.. automodule:: pganonymize.cli - :members: - :undoc-members: - :show-inheritance: - -pganonymize.constants module ------------------------------ - -.. automodule:: pganonymize.constants - :members: - :undoc-members: - :show-inheritance: - -pganonymize.exceptions module ------------------------------- - -.. automodule:: pganonymize.exceptions - :members: - :undoc-members: - :show-inheritance: - -pganonymize.providers module ------------------------------ - -.. automodule:: pganonymize.providers - :members: - :undoc-members: - :show-inheritance: - -pganonymize.utils module -------------------------- - -.. automodule:: pganonymize.utils - :members: - :undoc-members: - :show-inheritance: - -pganonymize.version module ---------------------------- - -.. automodule:: pganonymize.version - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: pganonymize - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/schema.rst b/docs/schema.rst index 866a44e7..b8a42806 100644 --- a/docs/schema.rst +++ b/docs/schema.rst @@ -82,7 +82,7 @@ anonymization process. Each field entry has its own ``provider`` that defines ho - email: provider: name: md5 - append: @localhost + append: "@localhost" ``excludes`` ~~~~~~~~~~~~ diff --git a/pganonymize/cli.py b/pganonymize/cli.py index 4a85c6ba..9d035b07 100644 --- a/pganonymize/cli.py +++ b/pganonymize/cli.py @@ -6,7 +6,7 @@ import logging import time -from pganonymize.config import config +from pganonymize.config import config, validate_args_with_config from pganonymize.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE from pganonymize.providers import provider_registry from pganonymize.utils import anonymize_tables, create_database_dump, get_connection, truncate_tables @@ -46,7 +46,18 @@ def get_arg_parser(): parser.add_argument('--dry-run', action='store_true', help='Don\'t commit changes made on the database', default=False) parser.add_argument('--dump-file', help='Create a database dump file with the given name') + parser.add_argument('--dump-options', help='Options to pass to the pg_dump command', + default='--format custom --compress 9') parser.add_argument('--init-sql', help='SQL to run before starting anonymization', default=False) + parser.add_argument( + '--parallel', + action='store_true', + help=( + 'Parallelize anonymization of value.' + 'WARNING: `fake.unique.*` providers are not compatible with this option' + ), + default=False, + ) return parser @@ -65,6 +76,8 @@ def main(args): config.schema_file = args.schema + validate_args_with_config(args, config) + pg_args = get_pg_args(args) connection = get_connection(pg_args) if args.init_sql: @@ -75,7 +88,12 @@ def main(args): start_time = time.time() truncate_tables(connection) - anonymize_tables(connection, verbose=args.verbose, dry_run=args.dry_run) + anonymize_tables( + connection, + verbose=args.verbose, + dry_run=args.dry_run, + parallel=args.parallel, + ) if not args.dry_run: connection.commit() @@ -85,4 +103,4 @@ def main(args): logging.info('Anonymization took {:.2f}s'.format(end_time - start_time)) if args.dump_file: - create_database_dump(args.dump_file, pg_args) + create_database_dump(args.dump_file, pg_args, args.dump_options) diff --git a/pganonymize/config.py b/pganonymize/config.py index 08799960..921bff31 100644 --- a/pganonymize/config.py +++ b/pganonymize/config.py @@ -2,6 +2,7 @@ import re import yaml +from pganonymize.exceptions import InvalidConfiguration class Config(object): @@ -55,3 +56,14 @@ def constructor_env_variables(loader, node): config = Config() + + +def validate_args_with_config(args, config): + definitions = config.schema.get('tables', []) + for definition in definitions: + table_definition = list(definition.values())[0] + columns = table_definition.get('fields', []) + for column in columns: + column_config = list(column.values())[0] + if args.parallel and column_config['provider']['name'].startswith('fake.unique'): + raise InvalidConfiguration('`--parallel` option and `fake.unique.*` providers are incompatible') diff --git a/pganonymize/exceptions.py b/pganonymize/exceptions.py index bd893c47..1f118d9e 100644 --- a/pganonymize/exceptions.py +++ b/pganonymize/exceptions.py @@ -20,3 +20,7 @@ class ProviderAlreadyRegistered(PgAnonymizeException): class BadDataFormat(PgAnonymizeException): """Raised if the anonymized data cannot be copied.""" + + +class InvalidConfiguration(PgAnonymizeException): + """Raised if configuration is invalid.""" diff --git a/pganonymize/utils.py b/pganonymize/utils.py index 11be694a..b9fca23e 100644 --- a/pganonymize/utils.py +++ b/pganonymize/utils.py @@ -24,13 +24,14 @@ psycopg2.extras.register_uuid() -def anonymize_tables(connection, verbose=False, dry_run=False): +def anonymize_tables(connection, verbose=False, dry_run=False, parallel=False): """ Anonymize a list of tables according to the schema definition. :param connection: A database connection instance. :param bool verbose: Display logging information and a progress bar. :param bool dry_run: Script is running in dry-run mode, no commit expected. + :param bool parallel: Data anonymization is done in parallel. """ definitions = config.schema.get('tables', []) for definition in definitions: @@ -44,8 +45,19 @@ def anonymize_tables(connection, verbose=False, dry_run=False): primary_key = table_definition.get('primary_key', DEFAULT_PRIMARY_KEY) total_count = get_table_count(connection, table_name, dry_run) chunk_size = table_definition.get('chunk_size', DEFAULT_CHUNK_SIZE) - build_and_then_import_data(connection, table_name, primary_key, columns, excludes, - search, total_count, chunk_size, verbose=verbose, dry_run=dry_run) + build_and_then_import_data( + connection, + table_name, + primary_key, + columns, + excludes, + search, + total_count, + chunk_size, + verbose=verbose, + dry_run=dry_run, + parallel=parallel, + ) end_time = time.time() logging.info('{} anonymization took {:.2f}s'.format(table_name, end_time - start_time)) @@ -63,8 +75,19 @@ def process_row(row, columns, excludes): return row -def build_and_then_import_data(connection, table, primary_key, columns, - excludes, search, total_count, chunk_size, verbose=False, dry_run=False): +def build_and_then_import_data( + connection, + table, + primary_key, + columns, + excludes, + search, + total_count, + chunk_size, + verbose=False, + dry_run=False, + parallel=False, +): """ Select all data from a table and return it together with a list of table columns. @@ -78,6 +101,7 @@ def build_and_then_import_data(connection, table, primary_key, columns, :param int chunk_size: Number of data rows to fetch with the cursor :param bool verbose: Display logging information and a progress bar. :param bool dry_run: Script is running in dry-run mode, no commit expected. + :param bool parallel: Data anonymization is done in parallel. """ column_names = get_column_names(columns) sql_columns = SQL(', ').join([Identifier(column_name) for column_name in [primary_key] + column_names]) @@ -95,7 +119,7 @@ def build_and_then_import_data(connection, table, primary_key, columns, for i in trange(batches, desc="Processing {} batches for {}".format(batches, table), disable=not verbose): records = cursor.fetchmany(size=chunk_size) if records: - data = parmap.map(process_row, records, columns, excludes, pm_pbar=verbose) + data = parmap.map(process_row, records, columns, excludes, pm_pbar=verbose, pm_parallel=parallel) import_data(connection, temp_table, [primary_key] + column_names, filter(None, data)) apply_anonymized_data(connection, temp_table, table, primary_key, columns) @@ -264,17 +288,22 @@ def truncate_tables(connection): cursor.close() -def create_database_dump(filename, db_args): +def create_database_dump(filename, db_args, dump_args): """ Create a dump file from the current database. :param str filename: Path to the dumpfile that should be created :param dict db_args: A dictionary with database related information """ - arguments = '-d {dbname} -U {user} -h {host} -p {port}'.format(**db_args) - cmd = 'pg_dump -Fc -Z 9 {args} -f {filename}'.format( - args=arguments, - filename=filename + env_vars = '' + if db_args.get('password'): + env_vars += 'PGPASSWORD={password}'.format(password=db_args['password']) + arguments = '--dbname {dbname} --username {user} --host {host} --port {port}'.format(**db_args) + cmd = '{env_vars}pg_dump {dump_args} {db_args} --file {filename}'.format( + env_vars='{} '.format(env_vars) if env_vars else '', + dump_args=dump_args, + db_args=arguments, + filename=filename, ) logging.info('Creating database dump file "%s"', filename) subprocess.call(cmd, shell=True) @@ -350,7 +379,7 @@ def nested_set(dic, path, value, delimiter='.'): Set dictionary value by path. :param dict dic: The source dictionary - :param str path: The path withing dictionary + :param str path: The path within dictionary :param value: The value to be set :param str delimiter: The path delimiter """ diff --git a/pganonymize/version.py b/pganonymize/version.py index 4cad2e2a..803770eb 100644 --- a/pganonymize/version.py +++ b/pganonymize/version.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -__version__ = '0.11.0' +__version__ = '0.12.0' diff --git a/readthedocs.yaml b/readthedocs.yaml new file mode 100644 index 00000000..8001c37c --- /dev/null +++ b/readthedocs.yaml @@ -0,0 +1,15 @@ +version: 2 + +sphinx: + configuration: docs/conf.py + +formats: all + +build: + os: "ubuntu-20.04" + tools: + python: "3.8" + +python: + install: + - requirements: requirements.txt diff --git a/requirements-tox.txt b/requirements-tox.txt index 4aca0a6b..d5c46732 100644 --- a/requirements-tox.txt +++ b/requirements-tox.txt @@ -1,5 +1,6 @@ coverage==4.5.4 mock==3.0.5 pytest-cov==2.8.1 -pytest-pythonpath==0.7.3 -six==1.12.0 +pytest-pythonpath==0.7.4 +six==1.16.0 +tqdm diff --git a/requirements.txt b/requirements.txt index 3bd05690..652745bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,10 @@ -r requirements-tox.txt flake8==3.9.2 isort==5.10.1 +myst-parser +parmap pgcopy>=1.5.0 pytest==6.2.5 -Sphinx==4.3.0 -sphinx-rtd-theme==1.0.0 +Sphinx==7.2.6 +sphinx-rtd-theme==2.0.0 tqdm>=4.61.1 diff --git a/setup.py b/setup.py index b83774ee..4b807529 100755 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ def run(self): 'parmap; python_version>="3.6"', 'parmap==1.5.2; python_version<"3.6"', 'pgcopy', + 'pgcopy>=1.5,<1.6; python_version<"3.6"', 'psycopg2', 'psycopg2>=2.8.4,<2.9; python_version<"3.6"', 'pyyaml', @@ -91,6 +92,8 @@ def run(self): 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'Topic :: Database' ], packages=find_packages(include=['pganonymize*']), diff --git a/tests/test_cli.py b/tests/test_cli.py index 616b6b5c..b1c462db 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -17,7 +17,7 @@ class TestCli(object): @pytest.mark.parametrize('cli_args, expected, expected_executes, commit_calls, call_dump', [ ['--host localhost --port 5432 --user root --password my-cool-password --dbname db --schema ./tests/schemes/valid_schema.yml -v --init-sql "set work_mem=\'1GB\'"', # noqa Namespace(verbose=1, list_providers=False, schema='./tests/schemes/valid_schema.yml', dbname='db', user='root', - password='my-cool-password', host='localhost', port='5432', dry_run=False, dump_file=None, init_sql="set work_mem='1GB'"), # noqa + password='my-cool-password', host='localhost', port='5432', dry_run=False, dump_file=None, dump_options='--format custom --compress 9', init_sql="set work_mem='1GB'", parallel=False), # noqa [call("set work_mem='1GB'"), call('TRUNCATE TABLE "django_session"'), call('SELECT COUNT(*) FROM "auth_user"'), @@ -32,7 +32,7 @@ class TestCli(object): ], ['--dry-run --host localhost --port 5432 --user root --password my-cool-password --dbname db --schema ./tests/schemes/valid_schema.yml -v --init-sql "set work_mem=\'1GB\'"', # noqa Namespace(verbose=1, list_providers=False, schema='./tests/schemes/valid_schema.yml', dbname='db', user='root', - password='my-cool-password', host='localhost', port='5432', dry_run=True, dump_file=None, init_sql="set work_mem='1GB'"), # noqa + password='my-cool-password', host='localhost', port='5432', dry_run=True, dump_file=None, dump_options='--format custom --compress 9', init_sql="set work_mem='1GB'", parallel=False), # noqa [call("set work_mem='1GB'"), call('TRUNCATE TABLE "django_session"'), call('SELECT "id", "first_name", "last_name", "email" FROM "auth_user" LIMIT 100'), @@ -42,9 +42,9 @@ class TestCli(object): ], 0, [] ], - ['--dump-file ./dump.sql --host localhost --port 5432 --user root --password my-cool-password --dbname db --schema ./tests/schemes/valid_schema.yml -v --init-sql "set work_mem=\'1GB\'"', # noqa + ['--dump-file ./dump.sql --dump-options "--format plain" --host localhost --port 5432 --user root --password my-cool-password --dbname db --schema ./tests/schemes/valid_schema.yml -v --init-sql "set work_mem=\'1GB\'"', # noqa Namespace(verbose=1, list_providers=False, schema='./tests/schemes/valid_schema.yml', dbname='db', user='root', - password='my-cool-password', host='localhost', port='5432', dry_run=False, dump_file='./dump.sql', init_sql="set work_mem='1GB'"), # noqa + password='my-cool-password', host='localhost', port='5432', dry_run=False, dump_file='./dump.sql', dump_options='--format plain', init_sql="set work_mem='1GB'", parallel=False), # noqa [ call("set work_mem='1GB'"), call('TRUNCATE TABLE "django_session"'), @@ -56,14 +56,14 @@ class TestCli(object): call('UPDATE "auth_user" t SET "first_name" = s."first_name", "last_name" = s."last_name", "email" = s."email" FROM "tmp_auth_user" s WHERE t."id" = s."id"') # noqa ], 1, - [call('pg_dump -Fc -Z 9 -d db -U root -h localhost -p 5432 -f ./dump.sql', shell=True)] + [call('PGPASSWORD=my-cool-password pg_dump --format plain --dbname db --username root --host localhost --port 5432 --file ./dump.sql', shell=True)] # noqa ], - ['--list-providers', + ['--list-providers --parallel', Namespace(verbose=None, list_providers=True, schema='schema.yml', dbname=None, user=None, - password='', host='localhost', port='5432', dry_run=False, dump_file=None, init_sql=False), + password='', host='localhost', port='5432', dry_run=False, dump_file=None, dump_options='--format custom --compress 9', init_sql=False, parallel=True), # noqa [], 0, [] - ] + ], ]) def test_cli_args(self, subprocess, patched_connect, quote_ident, cli_args, expected, expected_executes, commit_calls, call_dump): # noqa arg_parser = get_arg_parser() diff --git a/tests/test_config.py b/tests/test_config.py index 2b6485db..5d48816e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,9 +1,10 @@ import os import pytest -from mock import patch +from mock import patch, Mock -from pganonymize.config import load_schema +from pganonymize.config import load_schema, validate_args_with_config +from pganonymize.exceptions import InvalidConfiguration @pytest.mark.parametrize('file, envs, expected', [ @@ -59,3 +60,38 @@ def test_load_schema(file, envs, expected): with patch.dict(os.environ, envs): assert load_schema(file) == expected + + +def test_validate_args_with_config_when_valid(): + args = Mock(parallel=False) + schema = { + 'tables': [ + { + 'table_name': { + 'fields': [ + {'column_name': {'provider': {'name': 'fake.unique.pystr'}}} + ] + } + } + ] + } + config = Mock(schema=schema) + validate_args_with_config(args, config) + + +def test_validate_args_with_config_when_invalid(): + args = Mock(parallel=True) + schema = { + 'tables': [ + { + 'table_name': { + 'fields': [ + {'column_name': {'provider': {'name': 'fake.unique.pystr'}}} + ] + } + } + ] + } + config = Mock(schema=schema) + with pytest.raises(InvalidConfiguration): + validate_args_with_config(args, config) diff --git a/tests/test_providers.py b/tests/test_providers.py index 44843166..0a2db9be 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -142,7 +142,7 @@ def test_alter_value_with_locale(self, mock_faker): providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', locale='de_DE') assert mock_faker['de_DE'].date_of_birth.call_count == 1 - def test_alter_value_with_unkown_locale(self): + def test_alter_value_with_unknown_locale(self): with pytest.raises(InvalidProviderArgument): providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', locale='de_DE') diff --git a/tests/test_utils.py b/tests/test_utils.py index ac5cdc50..03f6d4a7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -236,6 +236,33 @@ class TestCreateDatabaseDump(object): @patch('pganonymize.utils.subprocess.call') def test(self, mock_call): - create_database_dump('/tmp/dump.gz', {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432}) - mock_call.assert_called_once_with('pg_dump -Fc -Z 9 -d database -U foo -h localhost -p 5432 -f /tmp/dump.gz', - shell=True) + filename = '/tmp/dump.gz' + db_args = {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432} + dump_args = '--format custom --compress 9' + create_database_dump(filename, db_args, dump_args) + mock_call.assert_called_once_with( + 'pg_dump --format custom --compress 9 --dbname database --username foo --host localhost --port 5432 --file /tmp/dump.gz', # noqa + shell=True, + ) + + @patch('pganonymize.utils.subprocess.call') + def test_with_password(self, mock_call): + filename = '/tmp/dump.gz' + db_args = {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432, 'password': 'pass'} + dump_args = '--format custom --compress 9' + create_database_dump(filename, db_args, dump_args) + mock_call.assert_called_once_with( + 'PGPASSWORD=pass pg_dump --format custom --compress 9 --dbname database --username foo --host localhost --port 5432 --file /tmp/dump.gz', # noqa + shell=True, + ) + + @patch('pganonymize.utils.subprocess.call') + def test_with_custom_dump_args(self, mock_call): + filename = '/tmp/dump.gz' + db_args = {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432} + dump_args = '--format plain' + create_database_dump(filename, db_args, dump_args) + mock_call.assert_called_once_with( + 'pg_dump --format plain --dbname database --username foo --host localhost --port 5432 --file /tmp/dump.gz', # noqa + shell=True, + ) diff --git a/tox.ini b/tox.ini index 167f7432..4896c701 100644 --- a/tox.ini +++ b/tox.ini @@ -1,32 +1,48 @@ [tox] -envlist = flake8,py27,py36,py37,py38,py39,py310 +requires = virtualenv<20.22.0 +envlist = + flake8 + py{27,36,37,38,39,310,311,312} [testenv:flake8] deps = flake8 commands = flake8 {toxinidir}/pganonymize {toxinidir}/tests [testenv] +skip_install = True setenv = PYTHONPATH = {toxinidir} deps = # faker py27: faker<4 - py{36,37,38,39,310}: faker>=9.9.0 + py{36,37,38,39,310,311,312}: faker>=9.9.0 # parmap py27: parmap==1.5.2 - py{36,37,38,39,310}: parmap>=1.5.2 - # psycopg2-binary - py27: psycopg2-binary==2.8.4 - py{36,37,38,39,310}: psycopg2-binary>=2.9.2 + py{36,37,38,39,310,311,312}: parmap>=1.5.2 # pyyaml py27: pyyaml<6 - py{36,37,38,39,310}: pyyaml>=6 + py{36,37,38,39,310,311,312}: pyyaml>=6 # pytest py27: pytest==4.0.2 py27: attrs<19.2 py27: more-itertools<8.11 - py{36,37,38,39,310}: pytest==6.2.5 + py{36,37}: pytest==6.2.5 + py{38,39,310,311,312}: pytest + # pgcopy + py27: pgcopy==1.5.0 + py{36,37,38,39,310,311,312}: pgcopy + # psycopg2 + py27: psycopg2 + py{36,37,38,39,310,311,312}: psycopg2>=2.9.2 # Common requirements -r{toxinidir}/requirements-tox.txt commands = py.test --basetemp={envtmpdir} {posargs} +passenv = + TOXENV + GITHUB_* + +[flake8] +exclude = docs,.tox,.git,.eggs +max-line-length = 120 +ignore = E731,W504