From 14228ee85b10df1f0b60391b187bba58513eeba7 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Wed, 1 Dec 2021 17:07:46 +0100 Subject: [PATCH 01/12] Added publish action --- .github/workflows/publish.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..97c5cba --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,26 @@ +name: Release + +on: + release: + types: [created] + +jobs: + publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: | + python setup.py sdist bdist_wheel + twine upload dist/* From ced0d88a2d7dfa09da00995d732a6fef79040c86 Mon Sep 17 00:00:00 2001 From: nurikk Date: Tue, 7 Dec 2021 11:25:49 +0800 Subject: [PATCH 02/12] Allow environment variables in schema definition --- docs/schema.rst | 26 +++++++++++++ pganonymizer/cli.py | 6 +-- pganonymizer/utils.py | 34 ++++++++++++++++- tests/schemes/schema_with_env_variables.yml | 13 +++++++ tests/test_utils.py | 42 ++++++++++++++++++++- 5 files changed, 115 insertions(+), 6 deletions(-) create mode 100644 tests/schemes/schema_with_env_variables.yml diff --git a/docs/schema.rst b/docs/schema.rst index ee841db..1706013 100644 --- a/docs/schema.rst +++ b/docs/schema.rst @@ -114,6 +114,32 @@ This is useful if you need to anonymize one or more specific records, eg for "Ri provider: name: clear +The YAML schema file supports placeholders for environment variables, e.g.: + +``!ENV ${HOST}`` + +``!ENV '/var/${LOG_PATH}'`` + +So you can construct dynamic filter conditions like: +.. 
code-block:: sh + $ export COMPANY_ID=123 + + $ export ACTION_TO_BE_TAKEN=clear + + $ pganonymize + + +**Example**:: + + - login: + search: id = '!ENV ${COMPANY_ID}' + search2: id = ${COMPANY_ID} + search3: username = '${USER_TO_BE_SEARCHED}' + fields: + - first_name: + provider: + name: ${ACTION_TO_BE_TAKEN} + ``chunk_size`` ~~~~~~~~~~~~~~ diff --git a/pganonymizer/cli.py b/pganonymizer/cli.py index d05cd2f..bf1f60a 100644 --- a/pganonymizer/cli.py +++ b/pganonymizer/cli.py @@ -6,11 +6,9 @@ import logging import time -import yaml - from pganonymizer.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE from pganonymizer.providers import provider_registry -from pganonymizer.utils import anonymize_tables, create_database_dump, get_connection, truncate_tables +from pganonymizer.utils import anonymize_tables, create_database_dump, get_connection, load_config, truncate_tables def get_pg_args(args): @@ -64,7 +62,7 @@ def main(args): list_provider_classes() return 0 - schema = yaml.load(open(args.schema), Loader=yaml.FullLoader) + schema = load_config(args.schema) pg_args = get_pg_args(args) connection = get_connection(pg_args) diff --git a/pganonymizer/utils.py b/pganonymizer/utils.py index 188bce5..8fe42e0 100644 --- a/pganonymizer/utils.py +++ b/pganonymizer/utils.py @@ -5,6 +5,7 @@ import json import logging import math +import os import re import subprocess import time @@ -15,6 +16,7 @@ from pgcopy import CopyManager from psycopg2.sql import SQL, Composed, Identifier from tqdm import trange +import yaml from pganonymizer.constants import DEFAULT_CHUNK_SIZE, DEFAULT_PRIMARY_KEY from pganonymizer.providers import provider_registry @@ -155,7 +157,7 @@ def create_temporary_table(connection, definitions, source_table, temp_table, pr FROM {source_table} WITH NO DATA""") cursor = connection.cursor() cursor.execute(ctas_query.format(temp_table=Identifier(temp_table), - source_table=Identifier(source_table), columns=sql_columns) + source_table=Identifier(source_table), 
columns=sql_columns) .as_string(connection) ) cursor.close() @@ -350,3 +352,33 @@ def nested_set(dic, path, value, delimiter='.'): for key in keys[:-1]: dic = dic.get(key, {}) dic[keys[-1]] = value + + +def load_config(schema): + # Original code from here https://gist.github.com/mkaranasou/ba83e25c835a8f7629e34dd7ede01931 + tag = '!ENV' + pattern = re.compile(r'.*?\${(\w+)}.*?') + custom_loader = yaml.FullLoader + custom_loader.add_implicit_resolver(tag, pattern, None) + + def constructor_env_variables(loader, node): + """ + Extracts the environment variable from the node's value + :param yaml.Loader loader: the yaml loader + :param node: the current node in the yaml + :return: the parsed string that contains the value of the environment + variable + """ + value = loader.construct_scalar(node) + match = pattern.findall(value) # to find all env variables in line + if match: + full_value = value + for g in match: + full_value = full_value.replace( + f'${{{g}}}', os.environ.get(g, g) + ) + return full_value + return value + + custom_loader.add_constructor(tag, constructor_env_variables) + return yaml.load(open(schema), Loader=custom_loader) diff --git a/tests/schemes/schema_with_env_variables.yml b/tests/schemes/schema_with_env_variables.yml new file mode 100644 index 0000000..efa0da4 --- /dev/null +++ b/tests/schemes/schema_with_env_variables.yml @@ -0,0 +1,13 @@ +primary_key: !ENV ${TEST_PRIMARY_KEY} +primary_key2: !ENV ${TEST_PRIMARY_KEY} +chunk_size: !ENV ${TEST_CHUNK_SIZE} +concat_missing: !ENV 'Hello, ${MISSING_ENV_VAL}' +concat_missing2: 'Hello, ${MISSING_ENV_VAL}' +concat_present: !ENV 'Hello, ${PRESENT_WORLD_NAME}' +concat_present2: ${PRESENT_WORLD_NAME} +concat_present3: Hello, ${PRESENT_WORLD_NAME} +search: id = ${COMPANY_ID} +search2: username = '${USER_TO_BE_SEARCHED}' +corrupted: username = '${CORRUPTED +corrupted2: !ENV +corrupted3: !ENV $ diff --git a/tests/test_utils.py b/tests/test_utils.py index e274856..981557b 100644 --- a/tests/test_utils.py +++ 
b/tests/test_utils.py @@ -1,5 +1,7 @@ import math +import os from collections import OrderedDict, namedtuple +from unittest import mock import pytest from mock import ANY, Mock, call, patch @@ -7,7 +9,7 @@ from tests.utils import quote_ident from pganonymizer.utils import (anonymize_tables, build_and_then_import_data, create_database_dump, - get_column_values, get_connection, import_data, truncate_tables) + get_column_values, get_connection, import_data, load_config, truncate_tables) class TestGetConnection: @@ -236,3 +238,41 @@ def test(self, mock_call): create_database_dump('/tmp/dump.gz', {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432}) mock_call.assert_called_once_with('pg_dump -Fc -Z 9 -d database -U foo -h localhost -p 5432 -f /tmp/dump.gz', shell=True) + + +class TestConfigLoader: + + @pytest.mark.parametrize('file, envs, expected', [ + ['./tests/schemes/valid_schema.yml', {}, { + 'tables': [{'auth_user': {'primary_key': 'id', 'chunk_size': 5000, 'fields': [ + {'first_name': {'provider': {'name': 'fake.first_name'}}}, + {'last_name': {'provider': {'name': 'set', 'value': 'Bar'}}}, + {'email': {'provider': {'name': 'md5'}, 'append': '@localhost'}} + ], 'excludes': [{'email': ['\\S[^@]*@example\\.com']}]}}], 'truncate': ['django_session']}], + ['./tests/schemes/schema_with_env_variables.yml', { + "TEST_CHUNK_SIZE": "123", + "TEST_PRIMARY_KEY": "foo-bar", + "PRESENT_WORLD_NAME": "beautiful world", + "COMPANY_ID": "42", + "USER_TO_BE_SEARCHED": "i wanna be forgotten", + }, { + 'primary_key': 'foo-bar', + 'primary_key2': 'foo-bar', + 'chunk_size': '123', + 'concat_missing': 'Hello, MISSING_ENV_VAL', + 'concat_missing2': 'Hello, ${MISSING_ENV_VAL}', + 'concat_present': 'Hello, beautiful world', + 'concat_present2': 'beautiful world', + 'concat_present3': 'Hello, beautiful world', + 'search': 'id = 42', + 'search2': "username = 'i wanna be forgotten'", + 'corrupted': "username = '${CORRUPTED", + 'corrupted2': '', + 'corrupted3': '$' + } + ] 
+ ]) + def test(self, file, envs, expected): + with mock.patch.dict(os.environ, envs): + print(load_config(file)) + assert load_config(file) == expected From 1b3e748999857adf836a896d7a20f29424813374 Mon Sep 17 00:00:00 2001 From: nurikk Date: Wed, 8 Dec 2021 09:56:18 +0800 Subject: [PATCH 03/12] Print query code during dry run --- pganonymizer/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pganonymizer/utils.py b/pganonymizer/utils.py index 188bce5..5f44edc 100644 --- a/pganonymizer/utils.py +++ b/pganonymizer/utils.py @@ -82,7 +82,8 @@ def build_and_then_import_data(connection, table, primary_key, columns, sql_select = Composed([sql_select, SQL(" WHERE {search_condition}".format(search_condition=search))]) if dry_run: sql_select = Composed([sql_select, SQL(" LIMIT 100")]) - cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor, name='fetch_large_result') + logging.info(sql_select.as_string(connection)) + cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor, name='fetch_large_result') cursor.execute(sql_select.as_string(connection)) temp_table = 'tmp_{table}'.format(table=table) create_temporary_table(connection, columns, table, temp_table, primary_key) From f91aaa76674ddadb9756483065b3a5da5dc1d4f5 Mon Sep 17 00:00:00 2001 From: nurikk Date: Wed, 8 Dec 2021 10:03:18 +0800 Subject: [PATCH 04/12] Trim trailing space --- pganonymizer/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pganonymizer/utils.py b/pganonymizer/utils.py index 432e691..ce795a6 100644 --- a/pganonymizer/utils.py +++ b/pganonymizer/utils.py @@ -85,7 +85,7 @@ def build_and_then_import_data(connection, table, primary_key, columns, if dry_run: sql_select = Composed([sql_select, SQL(" LIMIT 100")]) logging.info(sql_select.as_string(connection)) - cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor, name='fetch_large_result') + cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor, 
name='fetch_large_result') cursor.execute(sql_select.as_string(connection)) temp_table = 'tmp_{table}'.format(table=table) create_temporary_table(connection, columns, table, temp_table, primary_key) From 3be5365bebbc3c76b592ba82049dc00700b87881 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Mon, 13 Dec 2021 16:57:26 +0100 Subject: [PATCH 05/12] #39: Renamed project to pganonymize --- CHANGELOG.md | 2 ++ README.rst | 4 ++-- docs/conf.py | 20 ++++++++++---------- {pganonymizer => pganonymize}/__init__.py | 0 {pganonymizer => pganonymize}/__main__.py | 2 +- {pganonymizer => pganonymize}/cli.py | 6 +++--- {pganonymizer => pganonymize}/constants.py | 0 {pganonymizer => pganonymize}/exceptions.py | 0 {pganonymizer => pganonymize}/providers.py | 2 +- {pganonymizer => pganonymize}/utils.py | 4 ++-- {pganonymizer => pganonymize}/version.py | 0 pyproject.toml | 6 +++--- pytest.ini | 6 +++--- setup.py | 8 ++++---- tests/test_cli.py | 6 +++--- tests/test_providers.py | 4 ++-- tests/test_utils.py | 14 +++++++------- tox.ini | 2 +- 18 files changed, 44 insertions(+), 42 deletions(-) rename {pganonymizer => pganonymize}/__init__.py (100%) rename {pganonymizer => pganonymize}/__main__.py (84%) rename {pganonymizer => pganonymize}/cli.py (93%) rename {pganonymizer => pganonymize}/constants.py (100%) rename {pganonymizer => pganonymize}/exceptions.py (100%) rename {pganonymizer => pganonymize}/providers.py (97%) rename {pganonymizer => pganonymize}/utils.py (99%) rename {pganonymizer => pganonymize}/version.py (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f91dd0..4cf9875 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## Development +* [#39](https://github.com/rheinwerk-verlag/postgresql-anonymizer/issues/39): Renamed project to "pganonymize" + ## 0.7.0 (2021-11-30) * [#34](https://github.com/rheinwerk-verlag/postgresql-anonymizer/issues/34): Subprocess "run" being used on Python2.7 diff --git a/README.rst b/README.rst index e3adbad..b3af0d8 100644 --- 
a/README.rst +++ b/README.rst @@ -1,5 +1,5 @@ -PostgreSQL Anonymizer -===================== +pganonymize +=========== A commandline tool to anonymize PostgreSQL databases for DSGVO/GDPR purposes. diff --git a/docs/conf.py b/docs/conf.py index 3ba8a7c..fa56b9a 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# pganonymizer documentation build configuration file +# pganonymize documentation build configuration file # # This file is execfile()d with the current directory set to its # containing dir. @@ -33,7 +33,7 @@ # exec version.py instead of importing it. Importing may trigger unwanted # side-effects (if autodoc is used, the pypackage may be imported anyway). meta = {} -exec(open(os.path.join(project_root, 'pganonymizer', 'version.py')).read(), {}, meta) +exec(open(os.path.join(project_root, 'pganonymize', 'version.py')).read(), {}, meta) # -- General configuration --------------------------------------------- @@ -61,7 +61,7 @@ master_doc = 'index' # General information about the project. -project = u'PostgreSQL Anonymizer' +project = u'pganonymize' copyright = u'2019, Rheinwerk Verlag GmbH, Henning Kage' # The version info for the project you're documenting, acts as replacement @@ -214,8 +214,8 @@ # (source start file, target name, title, author, documentclass # [howto/manual]). latex_documents = [ - ('index', 'pganonymizer.tex', - u'PostgreSQL Anonymizer Documentation', + ('index', 'pganonymize.tex', + u'pganonymize Documentation', u'Henning Kage', 'manual'), ] @@ -245,8 +245,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
man_pages = [ - ('index', 'pganonymizer', - u'PostgreSQL Anonymizer Documentation', + ('index', 'pganonymize', + u'pganonymize Anonymizer Documentation', [u'Henning Kage'], 1) ] @@ -260,10 +260,10 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'pganonymizer', - u'PostgreSQL Anonymizer Documentation', + ('index', 'pganonymize', + u'pganonymize Documentation', u'Henning Kage', - 'pganonymizer', + 'pganonymize', 'A cookiecutter template for Rheinwerk Python packages', 'Miscellaneous'), ] diff --git a/pganonymizer/__init__.py b/pganonymize/__init__.py similarity index 100% rename from pganonymizer/__init__.py rename to pganonymize/__init__.py diff --git a/pganonymizer/__main__.py b/pganonymize/__main__.py similarity index 84% rename from pganonymizer/__main__.py rename to pganonymize/__main__.py index a18a742..382025f 100644 --- a/pganonymizer/__main__.py +++ b/pganonymize/__main__.py @@ -5,7 +5,7 @@ def main(): - from pganonymizer.cli import get_arg_parser, main + from pganonymize.cli import get_arg_parser, main try: args = get_arg_parser().parse_args() diff --git a/pganonymizer/cli.py b/pganonymize/cli.py similarity index 93% rename from pganonymizer/cli.py rename to pganonymize/cli.py index d05cd2f..cf5a33a 100644 --- a/pganonymizer/cli.py +++ b/pganonymize/cli.py @@ -8,9 +8,9 @@ import yaml -from pganonymizer.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE -from pganonymizer.providers import provider_registry -from pganonymizer.utils import anonymize_tables, create_database_dump, get_connection, truncate_tables +from pganonymize.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE +from pganonymize.providers import provider_registry +from pganonymize.utils import anonymize_tables, create_database_dump, get_connection, truncate_tables def get_pg_args(args): diff --git a/pganonymizer/constants.py b/pganonymize/constants.py similarity index 100% rename from pganonymizer/constants.py 
rename to pganonymize/constants.py diff --git a/pganonymizer/exceptions.py b/pganonymize/exceptions.py similarity index 100% rename from pganonymizer/exceptions.py rename to pganonymize/exceptions.py diff --git a/pganonymizer/providers.py b/pganonymize/providers.py similarity index 97% rename from pganonymizer/providers.py rename to pganonymize/providers.py index cec1535..41cad97 100644 --- a/pganonymizer/providers.py +++ b/pganonymize/providers.py @@ -7,7 +7,7 @@ from faker import Faker -from pganonymizer.exceptions import InvalidProvider, InvalidProviderArgument, ProviderAlreadyRegistered +from pganonymize.exceptions import InvalidProvider, InvalidProviderArgument, ProviderAlreadyRegistered fake_data = Faker() diff --git a/pganonymizer/utils.py b/pganonymize/utils.py similarity index 99% rename from pganonymizer/utils.py rename to pganonymize/utils.py index 188bce5..3a3ff1b 100644 --- a/pganonymizer/utils.py +++ b/pganonymize/utils.py @@ -16,8 +16,8 @@ from psycopg2.sql import SQL, Composed, Identifier from tqdm import trange -from pganonymizer.constants import DEFAULT_CHUNK_SIZE, DEFAULT_PRIMARY_KEY -from pganonymizer.providers import provider_registry +from pganonymize.constants import DEFAULT_CHUNK_SIZE, DEFAULT_PRIMARY_KEY +from pganonymize.providers import provider_registry def anonymize_tables(connection, definitions, verbose=False, dry_run=False): diff --git a/pganonymizer/version.py b/pganonymize/version.py similarity index 100% rename from pganonymizer/version.py rename to pganonymize/version.py diff --git a/pyproject.toml b/pyproject.toml index 9dfec34..838ebcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "postgresql-anonymizer" +name = "pganonymize" version = "0.7.0" description = "Commandline tool to anonymize PostgreSQL databases" authors = [ @@ -7,8 +7,8 @@ authors = [ ] license = "MIT" readme = "README.rst" -homepage = "https://github.com/rheinwerk-verlag/postgresql-anonymizer" -repository = 
"https://github.com/rheinwerk-verlag/postgresql-anonymizer" +homepage = "https://github.com/rheinwerk-verlag/pganonymize/" +repository = "https://github.com/rheinwerk-verlag/pganonymize.git" [tool.poetry.dependencies] python = "~2.7 || ^3.6" diff --git a/pytest.ini b/pytest.ini index e308cf3..58f54ac 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] -addopts = --cov=pganonymizer --cov-report term-missing --cov-config setup.cfg -testpaths = tests pganonymizer -python_paths = pganonymizer +addopts = --cov=pganonymize --cov-report term-missing --cov-config setup.cfg +testpaths = tests pganonymize +python_paths = pganonymize diff --git a/setup.py b/setup.py index 64e43ff..f822c34 100755 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ def run(self): return subprocess.call(['tox']) -exec(read('pganonymizer', 'version.py')) +exec(read('pganonymize', 'version.py')) install_requires = [ 'faker', @@ -73,7 +73,7 @@ def run(self): author_email='henning.kage@rheinwerk-verlag.de', maintainer='Henning Kage', maintainer_email='henning.kage@rheinwerk-verlag.de', - url='https://github.com/rheinwerk-verlag/postgresql-anonymizer', + url='https://github.com/rheinwerk-verlag/pganonymize', license='MIT license', classifiers=[ 'Development Status :: 3 - Alpha', @@ -92,7 +92,7 @@ def run(self): 'Environment :: Console', 'Topic :: Database' ], - packages=find_packages(include=['pganonymizer*']), + packages=find_packages(include=['pganonymize*']), include_package_data=True, install_requires=install_requires, tests_require=tests_require, @@ -101,7 +101,7 @@ def run(self): }, entry_points={ 'console_scripts': [ - 'pganonymize = pganonymizer.__main__:main' + 'pganonymize = pganonymize.__main__:main' ] } ) diff --git a/tests/test_cli.py b/tests/test_cli.py index f4961bd..02d821a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,13 +6,13 @@ from tests.utils import quote_ident -from pganonymizer.cli import get_arg_parser, main +from pganonymize.cli import get_arg_parser, 
main class TestCli: @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) - @patch('pganonymizer.utils.psycopg2.connect') - @patch('pganonymizer.utils.subprocess') + @patch('pganonymize.utils.psycopg2.connect') + @patch('pganonymize.utils.subprocess') @pytest.mark.parametrize('cli_args, expected, expected_executes, commit_calls, call_dump', [ ['--host localhost --port 5432 --user root --password my-cool-password --dbname db --schema ./tests/schemes/valid_schema.yml -v --init-sql "set work_mem=\'1GB\'"', # noqa Namespace(verbose=1, list_providers=False, schema='./tests/schemes/valid_schema.yml', dbname='db', user='root', diff --git a/tests/test_providers.py b/tests/test_providers.py index a33e0f0..acf43b3 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -6,7 +6,7 @@ import six from mock import MagicMock, Mock, patch -from pganonymizer import exceptions, providers +from pganonymize import exceptions, providers def test_register(): @@ -121,7 +121,7 @@ class TestFakeProvider: ('fake.first_name', 'first_name'), ('fake.unique.first_name', 'unique.first_name'), ]) - @patch('pganonymizer.providers.fake_data') + @patch('pganonymize.providers.fake_data') def test_alter_value(self, mock_fake_data, name, function_name): provider = providers.FakeProvider(name=name) provider.alter_value('Foo') diff --git a/tests/test_utils.py b/tests/test_utils.py index e274856..938b54e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -6,13 +6,13 @@ from tests.utils import quote_ident -from pganonymizer.utils import (anonymize_tables, build_and_then_import_data, create_database_dump, - get_column_values, get_connection, import_data, truncate_tables) +from pganonymize.utils import (anonymize_tables, build_and_then_import_data, create_database_dump, + get_column_values, get_connection, import_data, truncate_tables) class TestGetConnection: - @patch('pganonymizer.utils.psycopg2.connect') + @patch('pganonymize.utils.psycopg2.connect') def test(self, 
mock_connect): connection_data = { 'dbname': 'test', @@ -79,7 +79,7 @@ def test(self, inspect, util, quote_ident, tmp_table, cols, data): expected = [call('COPY "public"."src_tbl" ("id", "location") FROM STDIN WITH BINARY', ANY)] assert mock_cursor.copy_expert.call_args_list == expected - @patch('pganonymizer.utils.CopyManager') + @patch('pganonymize.utils.CopyManager') @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) def test_anonymize_tables(self, quote_ident, copy_manager): mock_cursor = Mock() @@ -163,7 +163,7 @@ def test_anonymize_tables(self, quote_ident, copy_manager): class TestBuildAndThenImport: @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) - @patch('pganonymizer.utils.CopyManager') + @patch('pganonymize.utils.CopyManager') @pytest.mark.parametrize('table, primary_key, columns, total_count, chunk_size', [ ['src_tbl', 'id', [{'col1': {'provider': {'name': 'md5'}}}, {'COL2': {'provider': {'name': 'md5'}}}], 10, 3] @@ -190,7 +190,7 @@ def test(self, quote_ident, copy_manager, table, primary_key, columns, total_cou call('UPDATE "src_tbl" t SET "col1" = s."col1", "COL2" = s."COL2" FROM "tmp_src_tbl" s WHERE t."id" = s."id"')] # noqa assert mock_cursor.execute.call_args_list == expected_execute_calls - @patch('pganonymizer.utils.CopyManager') + @patch('pganonymize.utils.CopyManager') def test_column_format(self, copy_manager): columns = [ { @@ -231,7 +231,7 @@ def test_column_format(self, copy_manager): class TestCreateDatabaseDump: - @patch('pganonymizer.utils.subprocess.call') + @patch('pganonymize.utils.subprocess.call') def test(self, mock_call): create_database_dump('/tmp/dump.gz', {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432}) mock_call.assert_called_once_with('pg_dump -Fc -Z 9 -d database -U foo -h localhost -p 5432 -f /tmp/dump.gz', diff --git a/tox.ini b/tox.ini index c431537..9c36a12 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ skip_missing_interpreters=True [testenv:flake8] deps 
= flake8 -commands = flake8 {toxinidir}/pganonymizer {toxinidir}/tests +commands = flake8 {toxinidir}/pganonymize {toxinidir}/tests [testenv] whitelist_externals = poetry From 2a171c23acd06deb6415ae8c82764145a2367880 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Mon, 13 Dec 2021 17:03:19 +0100 Subject: [PATCH 06/12] #39: Renamed project to pganonymize --- .isort.cfg | 2 +- MANIFEST.in | 2 +- Makefile | 4 ++-- README.rst | 4 ++-- docs/Makefile | 8 +++---- docs/api.rst | 2 +- docs/conf.py | 2 +- docs/{pganonymizer.rst => pganonymize.rst} | 28 +++++++++++----------- pganonymize/providers.py | 2 +- 9 files changed, 27 insertions(+), 27 deletions(-) rename docs/{pganonymizer.rst => pganonymize.rst} (59%) diff --git a/.isort.cfg b/.isort.cfg index d3ff88a..5835cbb 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -3,6 +3,6 @@ line_length=120 multi_line_output=0 sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,PGA,LOCALFOLDER default_section=THIRDPARTY -known_pga=pganonymizer +known_pga=pganonymize no_lines_before=LOCALFOLDER diff --git a/MANIFEST.in b/MANIFEST.in index 5f4027b..db2a0b2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,4 +2,4 @@ include LICENSE.rst include README.rst include CHANGELOG.md -recursive-include pganonymizer *.html *.js *.css *.png *.gif*.jpg *.jpeg *.svg *.po \ No newline at end of file +recursive-include pganonymize *.html *.js *.css *.png *.gif*.jpg *.jpeg *.svg *.po diff --git a/Makefile b/Makefile index ea43309..ac94085 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ BROWSER ?= xdg-open -PYTHON_PACKAGE = pganonymizer +PYTHON_PACKAGE = pganonymize TESTS_PACKAGE = tests .PHONY: clean clean-test clean-pyc clean-build docs help @@ -61,4 +61,4 @@ test: @poetry run pytest --cov=poetry --cov-config .coveragerc tests/ -sq test-all: ## run tests on every Python version with tox - @tox \ No newline at end of file + @tox diff --git a/README.rst b/README.rst index b3af0d8..2bbbfe7 100644 --- a/README.rst +++ b/README.rst @@ -136,7 +136,7 @@ If you want 
to run the anonymizer within a Docker container you first have to bu .. code-block:: sh - $ docker build -t pganonymizer . + $ docker build -t pganonymize . After that you can pass a schema file to the container, using Docker volumes, and call the anonymizer: @@ -144,7 +144,7 @@ After that you can pass a schema file to the container, using Docker volumes, an $ docker run \ -v :/schema.yml \ - -it pganonymizer \ + -it pganonymize \ /usr/local/bin/pganonymize \ --schema=/schema.yml \ --dbname= \ diff --git a/docs/Makefile b/docs/Makefile index b6f4929..9cb0448 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -85,17 +85,17 @@ qthelp: @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pganonymizer.qhcp" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pganonymize.qhcp" @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pganonymizer.qhc" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pganonymize.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/pganonymizer" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pganonymizer" + @echo "# mkdir -p $$HOME/.local/share/devhelp/pganonymize" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pganonymize" @echo "# devhelp" epub: diff --git a/docs/api.rst b/docs/api.rst index 47b33ee..4ac86b4 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -4,4 +4,4 @@ API .. toctree:: :maxdepth: 4 - pganonymizer + pganonymize diff --git a/docs/conf.py b/docs/conf.py index fa56b9a..4f535cd 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -194,7 +194,7 @@ #html_file_suffix = None # Output file base name for HTML help builder. 
-htmlhelp_basename = 'pganonymizerdoc' +htmlhelp_basename = 'pganonymizedoc' # -- Options for LaTeX output ------------------------------------------ diff --git a/docs/pganonymizer.rst b/docs/pganonymize.rst similarity index 59% rename from docs/pganonymizer.rst rename to docs/pganonymize.rst index e210de0..f460a56 100644 --- a/docs/pganonymizer.rst +++ b/docs/pganonymize.rst @@ -1,53 +1,53 @@ -pganonymizer package +pganonymize package ==================== Submodules ---------- -pganonymizer.cli module +pganonymize.cli module ----------------------- -.. automodule:: pganonymizer.cli +.. automodule:: pganonymize.cli :members: :undoc-members: :show-inheritance: -pganonymizer.constants module +pganonymize.constants module ----------------------------- -.. automodule:: pganonymizer.constants +.. automodule:: pganonymize.constants :members: :undoc-members: :show-inheritance: -pganonymizer.exceptions module +pganonymize.exceptions module ------------------------------ -.. automodule:: pganonymizer.exceptions +.. automodule:: pganonymize.exceptions :members: :undoc-members: :show-inheritance: -pganonymizer.providers module +pganonymize.providers module ----------------------------- -.. automodule:: pganonymizer.providers +.. automodule:: pganonymize.providers :members: :undoc-members: :show-inheritance: -pganonymizer.utils module +pganonymize.utils module ------------------------- -.. automodule:: pganonymizer.utils +.. automodule:: pganonymize.utils :members: :undoc-members: :show-inheritance: -pganonymizer.version module +pganonymize.version module --------------------------- -.. automodule:: pganonymizer.version +.. automodule:: pganonymize.version :members: :undoc-members: :show-inheritance: @@ -56,7 +56,7 @@ pganonymizer.version module Module contents --------------- -.. automodule:: pganonymizer +.. 
automodule:: pganonymize :members: :undoc-members: :show-inheritance: diff --git a/pganonymize/providers.py b/pganonymize/providers.py index 41cad97..f27e420 100644 --- a/pganonymize/providers.py +++ b/pganonymize/providers.py @@ -22,7 +22,7 @@ def register(self, provider_class, provider_id): """ Register a provider class. - :param pganonymizer.providers.Provider provider_class: Provider class that should be registered + :param pganonymize.providers.Provider provider_class: Provider class that should be registered :param str provider_id: A string id to register the provider for :raises ProviderAlreadyRegistered: If another provider with the given id has been registered """ From fd00b80cd60e0489bf8b48b6343f995cf49742c1 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Mon, 13 Dec 2021 17:05:12 +0100 Subject: [PATCH 07/12] #39: Updated the documentation links --- README.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 2bbbfe7..91bc1ee 100644 --- a/README.rst +++ b/README.rst @@ -3,7 +3,8 @@ pganonymize A commandline tool to anonymize PostgreSQL databases for DSGVO/GDPR purposes. -It uses a YAML file to define which tables and fields should be anonymized and provides various methods of anonymization. The tool requires a direct PostgreSQL connection to perform the anonymization. +It uses a YAML file to define which tables and fields should be anonymized and provides various methods of +anonymization. The tool requires a direct PostgreSQL connection to perform the anonymization. .. class:: no-web no-pdf @@ -157,13 +158,13 @@ After that you can pass a schema file to the container, using Docker volumes, an .. _uuid4: https://www.postgresql.org/docs/current/datatype-uuid.html .. _documentation: https://python-postgresql-anonymizer.readthedocs.io/en/latest/ .. _schema documentation: https://python-postgresql-anonymizer.readthedocs.io/en/latest/schema.html -.. 
_YAML sample schema: https://github.com/rheinwerk-verlag/postgresql-anonymizer/blob/master/sample_schema.yml +.. _YAML sample schema: https://github.com/rheinwerk-verlag/pganonymize/blob/master/sample_schema.yml .. |python| image:: https://img.shields.io/pypi/pyversions/pganonymize :alt: PyPI - Python Version .. |license| image:: https://img.shields.io/badge/license-MIT-green.svg - :target: https://github.com/rheinwerk-verlag/postgresql-anonymizer/blob/master/LICENSE.rst + :target: https://github.com/rheinwerk-verlag/pganonymize/blob/master/LICENSE.rst .. |pypi| image:: https://badge.fury.io/py/pganonymize.svg :target: https://badge.fury.io/py/pganonymize @@ -173,7 +174,7 @@ After that you can pass a schema file to the container, using Docker volumes, an :alt: Download count .. |build| image:: https://github.com/rheinwerk-verlag/postgresql-anonymizer/workflows/Test/badge.svg - :target: https://github.com/rheinwerk-verlag/postgresql-anonymizer/actions + :target: https://github.com/rheinwerk-verlag/pganonymize/actions .. 
|health| image:: https://snyk.io/advisor/python/pganonymize/badge.svg :target: https://snyk.io/advisor/python/pganonymize From bfd4c89579ed9b8bdb5e45e86085c2c570b3acf3 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Tue, 14 Dec 2021 00:46:14 +0100 Subject: [PATCH 08/12] Added changelog entry --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f91dd0..880bd08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## Development +* [#38](https://github.com/rheinwerk-verlag/pganonymize/pull/38): Allow environment variables in schema definition ([nurikk](https://github.com/nurikk)) + ## 0.7.0 (2021-11-30) * [#34](https://github.com/rheinwerk-verlag/postgresql-anonymizer/issues/34): Subprocess "run" being used on Python2.7 From 45e43a82331475dbb7d38cffe7a39af4db832445 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Tue, 14 Dec 2021 00:50:31 +0100 Subject: [PATCH 09/12] #39: Fixed import after merging with development --- pganonymize/cli.py | 6 +++--- pganonymize/utils.py | 2 +- tests/test_utils.py | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pganonymize/cli.py b/pganonymize/cli.py index bf1f60a..dbf22a6 100644 --- a/pganonymize/cli.py +++ b/pganonymize/cli.py @@ -6,9 +6,9 @@ import logging import time -from pganonymizer.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE -from pganonymizer.providers import provider_registry -from pganonymizer.utils import anonymize_tables, create_database_dump, get_connection, load_config, truncate_tables +from pganonymize.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE +from pganonymize.providers import provider_registry +from pganonymize.utils import anonymize_tables, create_database_dump, get_connection, load_config, truncate_tables def get_pg_args(args): diff --git a/pganonymize/utils.py b/pganonymize/utils.py index 5df9f49..45e6fd8 100644 --- a/pganonymize/utils.py +++ b/pganonymize/utils.py @@ -13,10 +13,10 @@ import parmap 
import psycopg2 import psycopg2.extras +import yaml from pgcopy import CopyManager from psycopg2.sql import SQL, Composed, Identifier from tqdm import trange -import yaml from pganonymize.constants import DEFAULT_CHUNK_SIZE, DEFAULT_PRIMARY_KEY from pganonymize.providers import provider_registry diff --git a/tests/test_utils.py b/tests/test_utils.py index 981557b..c0afc79 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,13 +8,13 @@ from tests.utils import quote_ident -from pganonymizer.utils import (anonymize_tables, build_and_then_import_data, create_database_dump, - get_column_values, get_connection, import_data, load_config, truncate_tables) +from pganonymize.utils import (anonymize_tables, build_and_then_import_data, create_database_dump, + get_column_values, get_connection, import_data, load_config, truncate_tables) class TestGetConnection: - @patch('pganonymizer.utils.psycopg2.connect') + @patch('pganonymize.utils.psycopg2.connect') def test(self, mock_connect): connection_data = { 'dbname': 'test', @@ -81,7 +81,7 @@ def test(self, inspect, util, quote_ident, tmp_table, cols, data): expected = [call('COPY "public"."src_tbl" ("id", "location") FROM STDIN WITH BINARY', ANY)] assert mock_cursor.copy_expert.call_args_list == expected - @patch('pganonymizer.utils.CopyManager') + @patch('pganonymize.utils.CopyManager') @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) def test_anonymize_tables(self, quote_ident, copy_manager): mock_cursor = Mock() @@ -165,7 +165,7 @@ def test_anonymize_tables(self, quote_ident, copy_manager): class TestBuildAndThenImport: @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) - @patch('pganonymizer.utils.CopyManager') + @patch('pganonymize.utils.CopyManager') @pytest.mark.parametrize('table, primary_key, columns, total_count, chunk_size', [ ['src_tbl', 'id', [{'col1': {'provider': {'name': 'md5'}}}, {'COL2': {'provider': {'name': 'md5'}}}], 10, 3] @@ -192,7 +192,7 @@ def test(self, 
quote_ident, copy_manager, table, primary_key, columns, total_cou call('UPDATE "src_tbl" t SET "col1" = s."col1", "COL2" = s."COL2" FROM "tmp_src_tbl" s WHERE t."id" = s."id"')] # noqa assert mock_cursor.execute.call_args_list == expected_execute_calls - @patch('pganonymizer.utils.CopyManager') + @patch('pganonymize.utils.CopyManager') def test_column_format(self, copy_manager): columns = [ { @@ -233,7 +233,7 @@ def test_column_format(self, copy_manager): class TestCreateDatabaseDump: - @patch('pganonymizer.utils.subprocess.call') + @patch('pganonymize.utils.subprocess.call') def test(self, mock_call): create_database_dump('/tmp/dump.gz', {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432}) mock_call.assert_called_once_with('pg_dump -Fc -Z 9 -d database -U foo -h localhost -p 5432 -f /tmp/dump.gz', From d8920e36060bfa8785b15d6906dfa2776e3b13f7 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Tue, 14 Dec 2021 01:04:36 +0100 Subject: [PATCH 10/12] #39: Added a new link page --- docs/index.rst | 1 + docs/links.rst | 14 ++++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 docs/links.rst diff --git a/docs/index.rst b/docs/index.rst index 363728d..091dd01 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,6 +14,7 @@ Contents: deploy license changelog + links Indices and tables diff --git a/docs/links.rst b/docs/links.rst new file mode 100644 index 0000000..c2c9794 --- /dev/null +++ b/docs/links.rst @@ -0,0 +1,14 @@ +Links +===== + +The following links refer to projects that have a similar purpose of anonymizing a PostgreSQL database. Thanks to the +authors of these projects. Some of them inspired the author of this project, e.g. `pgantomizer`_ for using a human +readable declaration file in YAML. + +* `PostgreSQL Anonymizer`_: Anonymization & Data Masking for PostgreSQL +* `pg-anonymizer`_: Dump anonymized PostgreSQL database with a NodeJS CLI +* `pgantomizer`_: Anonymize data in your PostgreSQL database with ease + +.. 
_PostgreSQL Anonymizer: https://gitlab.com/dalibo/postgresql_anonymizer +.. _pg-anonymizer: https://github.com/rap2hpoutre/pg-anonymizer +.. _pgantomizer: https://github.com/asgeirrr/pgantomizer From a1c8c5edc9a99e3123fc6996a449086baecebd98 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Tue, 14 Dec 2021 01:08:41 +0100 Subject: [PATCH 11/12] #39: Fixed documentation warning --- docs/schema.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/schema.rst b/docs/schema.rst index 1706013..e3d9421 100644 --- a/docs/schema.rst +++ b/docs/schema.rst @@ -122,6 +122,7 @@ YAML schema file supports placeholders with environment variables, ex: So you can construct dynamic filter conditions like: .. code-block:: sh + $ export COMPANY_ID=123 $ export ACTION_TO_BE_TAKEN=clear From 78fbda781ab379622d6a596d730945ca398cca12 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Tue, 15 Mar 2022 09:56:29 +0100 Subject: [PATCH 12/12] Release 0.8.0 --- CHANGELOG.md | 2 ++ pganonymize/version.py | 2 +- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2370011..365ac84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## Development +## 0.8.0 (2022-03-15) + * [#39](https://github.com/rheinwerk-verlag/pganonymize/issues/39): Renamed project to "pganonymize" * [#38](https://github.com/rheinwerk-verlag/pganonymize/pull/38): Allow environment variables in schema definition ([nurikk](https://github.com/nurikk)) diff --git a/pganonymize/version.py b/pganonymize/version.py index 82dc6dc..0c6d7b8 100644 --- a/pganonymize/version.py +++ b/pganonymize/version.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -__version__ = '0.7.0' +__version__ = '0.8.0' diff --git a/pyproject.toml b/pyproject.toml index 838ebcb..9557349 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pganonymize" -version = "0.7.0" +version = "0.8.0" description = "Commandline tool to anonymize PostgreSQL databases" 
authors = [ "Henning Kage "