From 79e6415f4a6fb0ae815cd0aedc9cff86c702b799 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Fri, 25 Nov 2022 12:00:11 +0100 Subject: [PATCH 01/15] #47: Basic implementation for the faker localization modes --- pganonymize/cli.py | 9 +++-- pganonymize/config.py | 57 ++++++++++++++++++++++++++++ pganonymize/providers.py | 81 +++++++++++++++++++++++++++------------- pganonymize/utils.py | 51 +++++-------------------- 4 files changed, 128 insertions(+), 70 deletions(-) create mode 100644 pganonymize/config.py diff --git a/pganonymize/cli.py b/pganonymize/cli.py index dbf22a63..4a85c6ba 100644 --- a/pganonymize/cli.py +++ b/pganonymize/cli.py @@ -6,9 +6,10 @@ import logging import time +from pganonymize.config import config from pganonymize.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE from pganonymize.providers import provider_registry -from pganonymize.utils import anonymize_tables, create_database_dump, get_connection, load_config, truncate_tables +from pganonymize.utils import anonymize_tables, create_database_dump, get_connection, truncate_tables def get_pg_args(args): @@ -62,7 +63,7 @@ def main(args): list_provider_classes() return 0 - schema = load_config(args.schema) + config.schema_file = args.schema pg_args = get_pg_args(args) connection = get_connection(pg_args) @@ -73,8 +74,8 @@ def main(args): cursor.close() start_time = time.time() - truncate_tables(connection, schema.get('truncate', [])) - anonymize_tables(connection, schema.get('tables', []), verbose=args.verbose, dry_run=args.dry_run) + truncate_tables(connection) + anonymize_tables(connection, verbose=args.verbose, dry_run=args.dry_run) if not args.dry_run: connection.commit() diff --git a/pganonymize/config.py b/pganonymize/config.py new file mode 100644 index 00000000..1fed7adc --- /dev/null +++ b/pganonymize/config.py @@ -0,0 +1,57 @@ +import os +import re + +import yaml + + +class Config(object): + """A class that wraps access to the given YAML schema file.""" + + def __init__(self): + 
self._schema = None + self.schema_file = None + + @property + def schema(self): + """ + Return the schema loaded from the given YAML schema file. + + :return: + :rtype: + """ + if self._schema is None and self.schema_file is not None: + self._schema = load_schema(self.schema_file) + return self._schema + + +def load_schema(schema_file): + # Original code from here https://gist.github.com/mkaranasou/ba83e25c835a8f7629e34dd7ede01931 + tag = '!ENV' + pattern = re.compile(r'.*?\${(\w+)}.*?') + custom_loader = yaml.FullLoader + custom_loader.add_implicit_resolver(tag, pattern, None) + + def constructor_env_variables(loader, node): + """ + Extract the environment variable from the node's value. + + :param yaml.Loader loader: The yaml loader + :param node: The current node in the yaml + :return: The parsed string that contains the value of the environment variable + """ + value = loader.construct_scalar(node) + match = pattern.findall(value) # to find all env variables in line + if match: + full_value = value + for g in match: + full_value = full_value.replace( + '${{{g}}}'.format(g=g), os.environ.get(g, g) + ) + return full_value + return value + + custom_loader.add_constructor(tag, constructor_env_variables) + return yaml.load(open(schema_file), Loader=custom_loader) + + +config = Config() diff --git a/pganonymize/providers.py b/pganonymize/providers.py index 00385215..de3466cc 100644 --- a/pganonymize/providers.py +++ b/pganonymize/providers.py @@ -7,9 +7,32 @@ from faker import Faker +from pganonymize.config import config from pganonymize.exceptions import InvalidProvider, InvalidProviderArgument, ProviderAlreadyRegistered -fake_data = Faker() + +class FakerInitializer(object): + """A wrapper that allows to instantiate a faker instance with specific locales.""" + + def __init__(self): + self._faker = None + + @property + def faker(self): + """ + Return the actual :class:`faker.Faker` instance, with optional locales taken from the YAML schema. 
+ + :return: A faker instance + :rtype: faker.Faker + """ + if self._faker is None: + options = config.schema.get('options', {}) + locales = options.get('faker', {}).get('locales', None) + self._faker = Faker(locales) + return self._faker + + +faker_initializer = FakerInitializer() class ProviderRegistry(object): @@ -33,10 +56,12 @@ def register(self, provider_class, provider_id): def get_provider(self, provider_id): """ - Return a provider by it's provider id. + Return a provider by its provider id. :param str provider_id: The string id of the desired provider. :raises InvalidProvider: If no provider can be found with the given id. + :return: The provider class that matches the id. + :rtype: type """ for key, cls in self._registry.items(): if (cls.regex_match is True and re.match(re.compile(key), provider_id) is not None) or key == provider_id: @@ -78,10 +103,8 @@ class Provider(object): regex_match = False """Defines whether a provider matches it's id using regular expressions.""" - def __init__(self, **kwargs): - self.kwargs = kwargs - - def alter_value(self, value): + @classmethod + def alter_value(cls, value, **kwargs): """ Alter or replace the original value of the database column. 
@@ -94,15 +117,17 @@ def alter_value(self, value): class ChoiceProvider(Provider): """Provider that returns a random value from a list of choices.""" - def alter_value(self, value): - return random.choice(self.kwargs.get('values')) + @classmethod + def alter_value(cls, value, **kwargs): + return random.choice(kwargs.get('values')) @register('clear') class ClearProvider(Provider): """Provider to set a field value to None.""" - def alter_value(self, value): + @classmethod + def alter_value(cls, value, **kwargs): return None @@ -112,11 +137,12 @@ class FakeProvider(Provider): regex_match = True - def alter_value(self, value): - func_name = self.kwargs['name'].split('.', 1)[1] - func_kwargs = self.kwargs.get('kwargs', {}) + @classmethod + def alter_value(cls, value, **kwargs): + func_name = kwargs['name'].split('.', 1)[1] + func_kwargs = kwargs.get('kwargs', {}) try: - func = operator.attrgetter(func_name)(fake_data) + func = operator.attrgetter(func_name)(faker_initializer.faker) except AttributeError as exc: raise InvalidProviderArgument(exc) return func(**func_kwargs) @@ -129,8 +155,9 @@ class MaskProvider(Provider): default_sign = 'X' """The default string used to replace each character.""" - def alter_value(self, value): - sign = self.kwargs.get('sign', self.default_sign) or self.default_sign + @classmethod + def alter_value(cls, value, **kwargs): + sign = kwargs.get('sign', cls.default_sign) or cls.default_sign return sign * len(value) @@ -143,10 +170,11 @@ class PartialMaskProvider(Provider): default_unmasked_right = 1 """The default string used to replace each character.""" - def alter_value(self, value): - sign = self.kwargs.get('sign', self.default_sign) or self.default_sign - unmasked_left = self.kwargs.get('unmasked_left', self.default_unmasked_left) or self.default_unmasked_left - unmasked_right = self.kwargs.get('unmasked_right', self.default_unmasked_right) or self.default_unmasked_right + @classmethod + def alter_value(cls, value, **kwargs): + sign = 
kwargs.get('sign', cls.default_sign) or cls.default_sign + unmasked_left = kwargs.get('unmasked_left', cls.default_unmasked_left) or cls.default_unmasked_left + unmasked_right = kwargs.get('unmasked_right', cls.default_unmasked_right) or cls.default_unmasked_right return ( value[:unmasked_left] + @@ -162,9 +190,10 @@ class MD5Provider(Provider): default_max_length = 8 """The default length used for the number representation.""" - def alter_value(self, value): - as_number = self.kwargs.get('as_number', False) - as_number_length = self.kwargs.get('as_number_length', self.default_max_length) + @classmethod + def alter_value(cls, value, **kwargs): + as_number = kwargs.get('as_number', False) + as_number_length = kwargs.get('as_number_length', cls.default_max_length) hashed = md5(value.encode('utf-8')).hexdigest() if as_number: return int(hashed, 16) % (10 ** as_number_length) @@ -176,13 +205,15 @@ def alter_value(self, value): class SetProvider(Provider): """Provider to set a static value.""" - def alter_value(self, value): - return self.kwargs.get('value') + @classmethod + def alter_value(cls, value, **kwargs): + return kwargs.get('value') @register('uuid4') class UUID4Provider(Provider): """Provider to set a random uuid value.""" - def alter_value(self, value): + @classmethod + def alter_value(cls, value, **kwargs): return uuid4() diff --git a/pganonymize/utils.py b/pganonymize/utils.py index f0b31e8a..6689ba94 100644 --- a/pganonymize/utils.py +++ b/pganonymize/utils.py @@ -5,7 +5,6 @@ import json import logging import math -import os import re import subprocess import time @@ -13,24 +12,24 @@ import parmap import psycopg2 import psycopg2.extras -import yaml from pgcopy import CopyManager from psycopg2.sql import SQL, Composed, Identifier from tqdm import trange +from pganonymize.config import config from pganonymize.constants import DEFAULT_CHUNK_SIZE, DEFAULT_PRIMARY_KEY from pganonymize.providers import provider_registry -def anonymize_tables(connection, 
definitions, verbose=False, dry_run=False): +def anonymize_tables(connection, verbose=False, dry_run=False): """ Anonymize a list of tables according to the schema definition. :param connection: A database connection instance. - :param list definitions: A list of table definitions from the YAML schema. :param bool verbose: Display logging information and a progress bar. :param bool dry_run: Script is running in dry-run mode, no commit expected. """ + definitions = config.schema.get('tables', []) for definition in definitions: start_time = time.time() table_name = list(definition.keys())[0] @@ -233,26 +232,26 @@ def get_column_values(row, columns): orig_value = nested_get(row, full_column_name) # Skip the current column if there is no value to be altered if orig_value is not None: - provider = provider_registry.get_provider(provider_config['name'])(**provider_config) - value = provider.alter_value(orig_value) + provider_class = provider_registry.get_provider(provider_config['name']) + value = provider_class.alter_value(orig_value, **provider_config) append = column_definition.get('append') if append: value = value + append - format = column_definition.get('format') - if format: - value = format.format(pga_value=value, **row) + _format = column_definition.get('format') + if _format: + value = _format.format(pga_value=value, **row) nested_set(row, full_column_name, value) column_dict[column_name] = nested_get(row, column_name) return column_dict -def truncate_tables(connection, tables): +def truncate_tables(connection): """ Truncate a list of tables. 
:param connection: A database connection instance - :param list[str] tables: A list of table names """ + tables = config.schema.get('truncate', []) if not tables: return cursor = connection.cursor() @@ -356,33 +355,3 @@ def nested_set(dic, path, value, delimiter='.'): for key in keys[:-1]: dic = dic.get(key, {}) dic[keys[-1]] = value - - -def load_config(schema): - # Original code from here https://gist.github.com/mkaranasou/ba83e25c835a8f7629e34dd7ede01931 - tag = '!ENV' - pattern = re.compile(r'.*?\${(\w+)}.*?') - custom_loader = yaml.FullLoader - custom_loader.add_implicit_resolver(tag, pattern, None) - - def constructor_env_variables(loader, node): - """ - Extract the environment variable from the node's value. - - :param yaml.Loader loader: The yaml loader - :param node: The current node in the yaml - :return: The parsed string that contains the value of the environment variable - """ - value = loader.construct_scalar(node) - match = pattern.findall(value) # to find all env variables in line - if match: - full_value = value - for g in match: - full_value = full_value.replace( - '${{{g}}}'.format(g=g), os.environ.get(g, g) - ) - return full_value - return value - - custom_loader.add_constructor(tag, constructor_env_variables) - return yaml.load(open(schema), Loader=custom_loader) From bddd2d59cb73f57ed6f7edda58056942ca41cdfb Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Fri, 25 Nov 2022 16:37:04 +0100 Subject: [PATCH 02/15] #47: Renamed the `alter_value` parameter to prevent name clashing with `value` kwargs --- pganonymize/providers.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pganonymize/providers.py b/pganonymize/providers.py index de3466cc..1dd65a89 100644 --- a/pganonymize/providers.py +++ b/pganonymize/providers.py @@ -104,11 +104,11 @@ class Provider(object): """Defines whether a provider matches it's id using regular expressions.""" @classmethod - def alter_value(cls, value, **kwargs): + def
alter_value(cls, original_value, **kwargs): """ Alter or replace the original value of the database column. - :param value: The original value of the database column. + :param original_value: The original value of the database column. """ raise NotImplementedError() @@ -118,7 +118,7 @@ class ChoiceProvider(Provider): """Provider that returns a random value from a list of choices.""" @classmethod - def alter_value(cls, value, **kwargs): + def alter_value(cls, original_value, **kwargs): return random.choice(kwargs.get('values')) @@ -127,7 +127,7 @@ class ClearProvider(Provider): """Provider to set a field value to None.""" @classmethod - def alter_value(cls, value, **kwargs): + def alter_value(cls, original_value, **kwargs): return None @@ -138,7 +138,7 @@ class FakeProvider(Provider): regex_match = True @classmethod - def alter_value(cls, value, **kwargs): + def alter_value(cls, original_value, **kwargs): func_name = kwargs['name'].split('.', 1)[1] func_kwargs = kwargs.get('kwargs', {}) try: @@ -156,9 +156,9 @@ class MaskProvider(Provider): """The default string used to replace each character.""" @classmethod - def alter_value(cls, value, **kwargs): + def alter_value(cls, original_value, **kwargs): sign = kwargs.get('sign', cls.default_sign) or cls.default_sign - return sign * len(value) + return sign * len(original_value) @register('partial_mask') @@ -171,15 +171,15 @@ class PartialMaskProvider(Provider): """The default string used to replace each character.""" @classmethod - def alter_value(cls, value, **kwargs): + def alter_value(cls, original_value, **kwargs): sign = kwargs.get('sign', cls.default_sign) or cls.default_sign unmasked_left = kwargs.get('unmasked_left', cls.default_unmasked_left) or cls.default_unmasked_left unmasked_right = kwargs.get('unmasked_right', cls.default_unmasked_right) or cls.default_unmasked_right return ( - value[:unmasked_left] + - (len(value) - (unmasked_left + unmasked_right)) * sign + - value[-unmasked_right:] + 
original_value[:unmasked_left] + + (len(original_value) - (unmasked_left + unmasked_right)) * sign + + original_value[-unmasked_right:] ) @@ -191,10 +191,10 @@ class MD5Provider(Provider): """The default length used for the number representation.""" @classmethod - def alter_value(cls, value, **kwargs): + def alter_value(cls, original_value, **kwargs): as_number = kwargs.get('as_number', False) as_number_length = kwargs.get('as_number_length', cls.default_max_length) - hashed = md5(value.encode('utf-8')).hexdigest() + hashed = md5(original_value.encode('utf-8')).hexdigest() if as_number: return int(hashed, 16) % (10 ** as_number_length) else: @@ -206,7 +206,7 @@ class SetProvider(Provider): """Provider to set a static value.""" @classmethod - def alter_value(cls, value, **kwargs): + def alter_value(cls, original_value, **kwargs): return kwargs.get('value') @@ -215,5 +215,5 @@ class UUID4Provider(Provider): """Provider to set a random uuid value.""" @classmethod - def alter_value(cls, value, **kwargs): + def alter_value(cls, original_value, **kwargs): return uuid4() From 15ff5aaf6012259e230c46e1204ccd5c45e978c9 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Fri, 25 Nov 2022 16:38:06 +0100 Subject: [PATCH 03/15] #47: Fixed first tests --- tests/test_cli.py | 3 +- tests/test_config.py | 62 +++++++++++++++++++++++++++ tests/test_providers.py | 76 +++++++++++++-------------------- tests/test_utils.py | 93 +++++++++-------------------------------- 4 files changed, 114 insertions(+), 120 deletions(-) create mode 100644 tests/test_config.py diff --git a/tests/test_cli.py b/tests/test_cli.py index 02d821a3..616b6b5c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -9,7 +9,8 @@ from pganonymize.cli import get_arg_parser, main -class TestCli: +class TestCli(object): + @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) @patch('pganonymize.utils.psycopg2.connect') @patch('pganonymize.utils.subprocess') diff --git a/tests/test_config.py 
b/tests/test_config.py new file mode 100644 index 00000000..17e4788c --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,62 @@ +import os + +import pytest +from mock import patch + +from pganonymize.config import load_schema + + +@pytest.mark.parametrize('file, envs, expected', [ + [ + './tests/schemes/valid_schema.yml', + {}, + { + 'tables': [ + { + 'auth_user': { + 'primary_key': 'id', + 'chunk_size': 5000, + 'fields': [ + {'first_name': {'provider': {'name': 'fake.first_name'}}}, + {'last_name': {'provider': {'name': 'set', 'value': 'Bar'}}}, + {'email': {'provider': {'name': 'md5'}, 'append': '@localhost'}}, + ], + 'excludes': [ + {'email': ['\\S[^@]*@example\\.com']}, + ] + } + } + ], + 'truncate': ['django_session'] + } + ], + [ + './tests/schemes/schema_with_env_variables.yml', + { + 'TEST_CHUNK_SIZE': '123', + 'TEST_PRIMARY_KEY': 'foo-bar', + 'PRESENT_WORLD_NAME': 'beautiful world', + 'COMPANY_ID': '42', + 'USER_TO_BE_SEARCHED': 'i wanna be forgotten', + }, + { + 'primary_key': 'foo-bar', + 'primary_key2': 'foo-bar', + 'chunk_size': '123', + 'concat_missing': 'Hello, MISSING_ENV_VAL', + 'concat_missing2': 'Hello, ${MISSING_ENV_VAL}', + 'concat_present': 'Hello, beautiful world', + 'concat_present2': 'beautiful world', + 'concat_present3': 'Hello, beautiful world', + 'search': 'id = 42', + 'search2': "username = 'i wanna be forgotten'", + 'corrupted': "username = '${CORRUPTED", + 'corrupted2': '', + 'corrupted3': '$', + } + ] +]) +def test_load_schema(file, envs, expected): + with patch.dict(os.environ, envs): + print(load_schema(file)) + assert load_schema(file) == expected diff --git a/tests/test_providers.py b/tests/test_providers.py index 354a53d5..70c24403 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -29,7 +29,7 @@ def alter_value(self, value): assert 'bar' in registry._registry -class TestProviderRegistry: +class TestProviderRegistry(object): def test_constructor(self): registry = providers.ProviderRegistry() @@ -91,7 +91,7 @@ 
def test_providers(self): pass -class TestProvider: +class TestProvider(object): def test_alter_value(self): provider = providers.Provider() @@ -99,59 +99,54 @@ def test_alter_value(self): provider.alter_value('Foo') -class TestChoiceProvider: +class TestChoiceProvider(object): def test_alter_value(self): choices = ['Foo', 'Bar', 'Baz'] - provider = providers.ChoiceProvider(values=choices) for choice in choices: - assert provider.alter_value(choice) in choices + assert providers.ChoiceProvider.alter_value(choice, values=choices) in choices -class TestClearProvider: +class TestClearProvider(object): def test_alter_value(self): provider = providers.ClearProvider() assert provider.alter_value('Foo') is None -class TestFakeProvider: +class TestFakeProvider(object): @pytest.mark.parametrize('name, function_name', [ ('fake.first_name', 'first_name'), ('fake.unique.first_name', 'unique.first_name'), ]) - @patch('pganonymize.providers.fake_data') - def test_alter_value(self, mock_fake_data, name, function_name): - provider = providers.FakeProvider(name=name) - provider.alter_value('Foo') - assert operator.attrgetter(function_name)(mock_fake_data).call_count == 1 + @patch('pganonymize.providers.faker_initializer') + def test_alter_value(self, mock_faker_initializer, name, function_name): + providers.FakeProvider.alter_value('Foo', name=name) + assert operator.attrgetter(function_name)(mock_faker_initializer.faker).call_count == 1 @pytest.mark.parametrize('name', ['fake.foo_name']) def test_invalid_names(self, name): - provider = providers.FakeProvider(name=name) with pytest.raises(exceptions.InvalidProviderArgument): - provider.alter_value('Foo') + providers.FakeProvider.alter_value('Foo', name=name) - @patch('pganonymize.providers.fake_data') - def test_alter_value_with_kwargs(self, mock_fake_data): - provider = providers.FakeProvider(name='fake.date_of_birth', kwargs={'minimum_age': 18}) - provider.alter_value('Foo') - assert mock_fake_data.date_of_birth.call_args == 
call(minimum_age=18) + @patch('pganonymize.providers.faker_initializer') + def test_alter_value_with_kwargs(self, mock_faker_initializer): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', kwargs={'minimum_age': 18}) + assert mock_faker_initializer.faker.date_of_birth.call_args == call(minimum_age=18) -class TestMaskProvider: +class TestMaskProvider(object): @pytest.mark.parametrize('value, sign, expected', [ ('Foo', None, 'XXX'), ('Baaaar', '?', '??????'), ]) def test_alter_value(self, value, sign, expected): - provider = providers.MaskProvider(sign=sign) - assert provider.alter_value(value) == expected + assert providers.MaskProvider.alter_value(value, sign=sign) == expected -class TestPartialMaskProvider: +class TestPartialMaskProvider(object): @pytest.mark.parametrize('value, sign, unmasked_left, unmasked_right, expected', [ ('Foo', None, 1, 1, 'FXo'), @@ -159,15 +154,11 @@ class TestPartialMaskProvider: ('Baaaar', '?', 2, 1, 'Ba???r'), ]) def test_alter_value(self, value, sign, unmasked_left, unmasked_right, expected): - provider = providers.PartialMaskProvider( - sign=sign, - unmasked_left=unmasked_left, - unmasked_right=unmasked_right - ) - assert provider.alter_value(value) == expected + assert providers.PartialMaskProvider.alter_value(value, sign=sign, unmasked_left=unmasked_left, + unmasked_right=unmasked_right) == expected -class TestMD5Provider: +class TestMD5Provider(object): def test_alter_value(self): provider = providers.MD5Provider() @@ -176,33 +167,26 @@ def test_alter_value(self): assert len(value) == 32 def test_as_number(self): - provider = providers.MD5Provider(as_number=True) - value = provider.alter_value('foo') + value = providers.MD5Provider.alter_value('foo', as_number=True) assert isinstance(value, six.integer_types) assert value == 985560 - - provider = providers.MD5Provider(as_number=True, as_number_length=8) - value = provider.alter_value('foobarbazadasd') + value = 
providers.MD5Provider.alter_value('foobarbazadasd', as_number=True, as_number_length=8) assert isinstance(value, six.integer_types) assert value == 45684001 -class TestSetProvider: +class TestSetProvider(object): @pytest.mark.parametrize('kwargs, expected', [ ({'value': None}, None), ({'value': 'Bar'}, 'Bar') ]) def test_alter_value(self, kwargs, expected): - provider = providers.SetProvider(**kwargs) - assert provider.alter_value('Foo') == expected + assert providers.SetProvider.alter_value('Foo', **kwargs) == expected -class TestUUID4Provider: - @pytest.mark.parametrize('kwargs, expected', [ - ({'value': None}, None), - ({'value': 'Bar'}, 'Bar') - ]) - def test_alter_value(self, kwargs, expected): - provider = providers.UUID4Provider(**kwargs) - assert type(provider.alter_value('Foo')) == uuid.UUID +class TestUUID4Provider(object): + + @pytest.mark.parametrize('value, expected', [(None, uuid.UUID), ('Foo', uuid.UUID)]) + def test_alter_value(self, value, expected): + assert type(providers.UUID4Provider.alter_value(value)) == expected diff --git a/tests/test_utils.py b/tests/test_utils.py index 9038ba98..9a2e3fb9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,4 @@ import math -import os from collections import OrderedDict, namedtuple import pytest @@ -7,11 +6,13 @@ from tests.utils import quote_ident -from pganonymize.utils import (anonymize_tables, build_and_then_import_data, create_database_dump, - get_column_values, get_connection, import_data, load_config, truncate_tables) +from pganonymize.utils import ( + anonymize_tables, build_and_then_import_data, create_database_dump, get_column_values, get_connection, import_data, + truncate_tables, +) -class TestGetConnection: +class TestGetConnection(object): @patch('pganonymize.utils.psycopg2.connect') def test(self, mock_connect): @@ -26,17 +27,20 @@ def test(self, mock_connect): mock_connect.assert_called_once_with(**connection_data) -class TestTruncateTables: +class 
TestTruncateTables(object): + @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) + @patch('pganonymize.utils.config') @pytest.mark.parametrize('tables, expected', [ [('table_a', 'table_b', 'CAPS_TABLe'), 'TRUNCATE TABLE "table_a", "table_b", "CAPS_TABLe"'], [(), None], ]) - def test(self, quote_ident, tables, expected): + def test(self, quote_ident, mock_config, tables, expected): mock_cursor = Mock() connection = Mock() connection.cursor.return_value = mock_cursor - truncate_tables(connection, tables) + mock_config.schema = {'truncate': tables} + truncate_tables(connection) if tables: connection.cursor.assert_called_once() assert mock_cursor.execute.call_args_list == [call(expected)] @@ -47,7 +51,7 @@ def test(self, quote_ident, tables, expected): mock_cursor.close.assert_not_called() -class TestImportData: +class TestImportData(object): @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) @patch('pgcopy.copy.util') @patch('pgcopy.copy.inspect') @@ -84,8 +88,9 @@ def test(self, inspect, util, quote_ident, tmp_table, cols, data): assert mock_cursor.copy_expert.call_args_list == expected @patch('pganonymize.utils.CopyManager') + @patch('pganonymize.utils.config') @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) - def test_anonymize_tables(self, quote_ident, copy_manager): + def test_anonymize_tables(self, quote_ident, mock_config, copy_manager): mock_cursor = Mock() mock_cursor.fetchone.return_value = [2] mock_cursor.fetchmany.side_effect = [ @@ -102,8 +107,8 @@ def test_anonymize_tables(self, quote_ident, copy_manager): connection = Mock() connection.cursor.return_value = mock_cursor - definitions = [] - anonymize_tables(connection, definitions, verbose=True) + mock_config.schema = {'tables': []} + anonymize_tables(connection, verbose=True) assert connection.cursor.call_count == 0 assert mock_cursor.close.call_count == 0 @@ -148,8 +153,8 @@ def test_anonymize_tables(self, quote_ident, copy_manager): } } ] - - 
anonymize_tables(connection, definitions, verbose=True) + mock_config.schema = {'tables': definitions} + anonymize_tables(connection, verbose=True) assert connection.cursor.call_count == mock_cursor.close.call_count assert copy_manager.call_args_list == [call(connection, 'tmp_auth_user', ['id', 'first_name', 'json_column'])] assert cmm.copy.call_count == 1 @@ -157,7 +162,7 @@ def test_anonymize_tables(self, quote_ident, copy_manager): ['dummy nameappend-me', b'{"field2": "dummy json field2"}']])] -class TestBuildAndThenImport: +class TestBuildAndThenImport(object): @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) @patch('pganonymize.utils.CopyManager') @pytest.mark.parametrize('table, primary_key, columns, total_count, chunk_size', [ @@ -228,68 +233,10 @@ def test_column_format(self, copy_manager): assert result == expected -class TestCreateDatabaseDump: +class TestCreateDatabaseDump(object): @patch('pganonymize.utils.subprocess.call') def test(self, mock_call): create_database_dump('/tmp/dump.gz', {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432}) mock_call.assert_called_once_with('pg_dump -Fc -Z 9 -d database -U foo -h localhost -p 5432 -f /tmp/dump.gz', shell=True) - - -class TestConfigLoader: - - @pytest.mark.parametrize('file, envs, expected', [ - [ - './tests/schemes/valid_schema.yml', - {}, - { - 'tables': [ - { - 'auth_user': { - 'primary_key': 'id', - 'chunk_size': 5000, - 'fields': [ - {'first_name': {'provider': {'name': 'fake.first_name'}}}, - {'last_name': {'provider': {'name': 'set', 'value': 'Bar'}}}, - {'email': {'provider': {'name': 'md5'}, 'append': '@localhost'}}, - ], - 'excludes': [ - {'email': ['\\S[^@]*@example\\.com']}, - ] - } - } - ], - 'truncate': ['django_session'] - } - ], - [ - './tests/schemes/schema_with_env_variables.yml', - { - 'TEST_CHUNK_SIZE': '123', - 'TEST_PRIMARY_KEY': 'foo-bar', - 'PRESENT_WORLD_NAME': 'beautiful world', - 'COMPANY_ID': '42', - 'USER_TO_BE_SEARCHED': 'i wanna be 
forgotten', - }, - { - 'primary_key': 'foo-bar', - 'primary_key2': 'foo-bar', - 'chunk_size': '123', - 'concat_missing': 'Hello, MISSING_ENV_VAL', - 'concat_missing2': 'Hello, ${MISSING_ENV_VAL}', - 'concat_present': 'Hello, beautiful world', - 'concat_present2': 'beautiful world', - 'concat_present3': 'Hello, beautiful world', - 'search': 'id = 42', - 'search2': "username = 'i wanna be forgotten'", - 'corrupted': "username = '${CORRUPTED", - 'corrupted2': '', - 'corrupted3': '$', - } - ] - ]) - def test(self, file, envs, expected): - with patch.dict(os.environ, envs): - print(load_config(file)) - assert load_config(file) == expected From 5664e4d7618d8affa55428174255ef8959a9b5bb Mon Sep 17 00:00:00 2001 From: Timo Steidle Date: Tue, 29 Nov 2022 10:03:44 +0100 Subject: [PATCH 04/15] Add support for faker locales --- pganonymize/providers.py | 22 +++++++++++++++++++++- tests/conftest.py | 11 +++++++++++ tests/test_config.py | 1 - tests/test_providers.py | 30 ++++++++++++++++++++++++------ tests/test_utils.py | 8 ++++---- 5 files changed, 60 insertions(+), 12 deletions(-) diff --git a/pganonymize/providers.py b/pganonymize/providers.py index 1dd65a89..a3f1af02 100644 --- a/pganonymize/providers.py +++ b/pganonymize/providers.py @@ -16,6 +16,7 @@ class FakerInitializer(object): def __init__(self): self._faker = None + self.default_locale = None @property def faker(self): @@ -29,8 +30,24 @@ def faker(self): options = config.schema.get('options', {}) locales = options.get('faker', {}).get('locales', None) self._faker = Faker(locales) + self.default_locale = options.get('faker', {}).get('default_locale', None) return self._faker + def get_locale_generator(self, locale): + """ + Get the internal generator for the given locale. + + :param str locale: A locale string + :raises InvalidProviderArgument: If locale is unknown (not configured within the global locales option). 
+ :return: A Generator instance for the given locale + :rtype: faker.Generator + """ + try: + generator = self.faker[locale] + except KeyError: + raise InvalidProviderArgument('Locale \'{}\' is unknown. Have you added it to the global option ' + '(options.faker.locales)?'.format(locale)) + return generator faker_initializer = FakerInitializer() @@ -141,8 +158,11 @@ class FakeProvider(Provider): def alter_value(cls, original_value, **kwargs): func_name = kwargs['name'].split('.', 1)[1] func_kwargs = kwargs.get('kwargs', {}) + locale = kwargs.get('locale', faker_initializer.default_locale) + # Use the generator for the locale if a locale is configured (per field definition or as global default locale) + faker_generator = faker_initializer.get_locale_generator(locale) if locale else faker_initializer.faker try: - func = operator.attrgetter(func_name)(faker_initializer.faker) + func = operator.attrgetter(func_name)(faker_generator) except AttributeError as exc: raise InvalidProviderArgument(exc) return func(**func_kwargs) diff --git a/tests/conftest.py b/tests/conftest.py index 40a96afc..6b781c90 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1 +1,12 @@ # -*- coding: utf-8 -*- +import pytest +from mock.mock import patch + +from pganonymize.config import config + + +@pytest.fixture +def valid_config(): + # Patch the config instance with a valid schema + with patch.multiple('pganonymize.config.config', schema_file='./tests/schemes/valid_schema.yml', _schema=None): + yield config diff --git a/tests/test_config.py b/tests/test_config.py index 17e4788c..2b6485db 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -58,5 +58,4 @@ ]) def test_load_schema(file, envs, expected): with patch.dict(os.environ, envs): - print(load_schema(file)) assert load_schema(file) == expected diff --git a/tests/test_providers.py b/tests/test_providers.py index 70c24403..555aac2c 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -5,8 +5,10 @@ import 
pytest import six from mock import MagicMock, Mock, call, patch +from mock.mock import PropertyMock from pganonymize import exceptions, providers +from pganonymize.exceptions import InvalidProviderArgument def test_register(): @@ -114,26 +116,42 @@ def test_alter_value(self): assert provider.alter_value('Foo') is None +@pytest.mark.usefixtures('valid_config') class TestFakeProvider(object): @pytest.mark.parametrize('name, function_name', [ ('fake.first_name', 'first_name'), ('fake.unique.first_name', 'unique.first_name'), ]) - @patch('pganonymize.providers.faker_initializer') - def test_alter_value(self, mock_faker_initializer, name, function_name): + @patch('pganonymize.providers.faker_initializer._faker') + def test_alter_value(self, mock_faker, name, function_name): providers.FakeProvider.alter_value('Foo', name=name) - assert operator.attrgetter(function_name)(mock_faker_initializer.faker).call_count == 1 + assert operator.attrgetter(function_name)(mock_faker).call_count == 1 @pytest.mark.parametrize('name', ['fake.foo_name']) def test_invalid_names(self, name): with pytest.raises(exceptions.InvalidProviderArgument): providers.FakeProvider.alter_value('Foo', name=name) - @patch('pganonymize.providers.faker_initializer') - def test_alter_value_with_kwargs(self, mock_faker_initializer): + @patch('pganonymize.providers.faker_initializer._faker') + def test_alter_value_with_kwargs(self, mock_faker): providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', kwargs={'minimum_age': 18}) - assert mock_faker_initializer.faker.date_of_birth.call_args == call(minimum_age=18) + assert mock_faker.date_of_birth.call_args == call(minimum_age=18) + + @patch('pganonymize.providers.faker_initializer._faker') + def test_alter_value_with_locale(self, mock_faker): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', locale='de_DE') + assert mock_faker['de_DE'].date_of_birth.call_count == 1 + + def test_alter_value_with_unkown_locale(self): + with 
pytest.raises(InvalidProviderArgument): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', locale='de_DE') + + @patch('pganonymize.providers.faker_initializer._faker') + def test_alter_value_use_default_locale(self, mock_faker): + with patch('pganonymize.providers.faker_initializer.default_locale', new=PropertyMock(return_value='en_US')): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth') + assert mock_faker['en_US'].date_of_birth.call_count == 1 class TestMaskProvider(object): diff --git a/tests/test_utils.py b/tests/test_utils.py index 9a2e3fb9..6d207bcb 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,6 +3,7 @@ import pytest from mock import ANY, Mock, call, patch +from mock.mock import PropertyMock from tests.utils import quote_ident @@ -30,17 +31,16 @@ def test(self, mock_connect): class TestTruncateTables(object): @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) - @patch('pganonymize.utils.config') @pytest.mark.parametrize('tables, expected', [ [('table_a', 'table_b', 'CAPS_TABLe'), 'TRUNCATE TABLE "table_a", "table_b", "CAPS_TABLe"'], [(), None], ]) - def test(self, quote_ident, mock_config, tables, expected): + def test(self, quote_ident, tables, expected): mock_cursor = Mock() connection = Mock() connection.cursor.return_value = mock_cursor - mock_config.schema = {'truncate': tables} - truncate_tables(connection) + with patch.multiple('pganonymize.config.config', schema_file=None, _schema={'truncate': tables}): + truncate_tables(connection) if tables: connection.cursor.assert_called_once() assert mock_cursor.execute.call_args_list == [call(expected)] From 07effc0f39c6198d85ebf3145ace78ee17048887 Mon Sep 17 00:00:00 2001 From: Timo Steidle Date: Tue, 29 Nov 2022 10:20:13 +0100 Subject: [PATCH 05/15] Add a default_locale option for the faker provider --- pganonymize/providers.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git 
a/pganonymize/providers.py b/pganonymize/providers.py index a3f1af02..cbc4c866 100644 --- a/pganonymize/providers.py +++ b/pganonymize/providers.py @@ -16,7 +16,17 @@ class FakerInitializer(object): def __init__(self): self._faker = None - self.default_locale = None + self._options = None + + @property + def options(self): + if self._options is None: + self._options = config.schema.get('options', {}).get('faker', {}) + return self._options + + @property + def default_locale(self): + return self.options.get('default_locale') @property def faker(self): @@ -27,10 +37,8 @@ def faker(self): :rtype: faker.Faker """ if self._faker is None: - options = config.schema.get('options', {}) - locales = options.get('faker', {}).get('locales', None) + locales = self.options.get('locales') self._faker = Faker(locales) - self.default_locale = options.get('faker', {}).get('default_locale', None) return self._faker def get_locale_generator(self, locale): @@ -49,6 +57,7 @@ def get_locale_generator(self, locale): '(options.faker.locales)?'.format(locale)) return generator + faker_initializer = FakerInitializer() From d0513f0903b5b96712ca19e5a3e16a234920470f Mon Sep 17 00:00:00 2001 From: Timo Steidle Date: Tue, 29 Nov 2022 11:07:08 +0100 Subject: [PATCH 06/15] Add and fix tests --- tests/conftest.py | 15 +++++++++++++++ tests/test_providers.py | 15 +++++++++------ tests/test_utils.py | 1 - 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6b781c90..958038a8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,7 @@ from mock.mock import patch from pganonymize.config import config +from pganonymize.providers import faker_initializer @pytest.fixture @@ -10,3 +11,17 @@ def valid_config(): # Patch the config instance with a valid schema with patch.multiple('pganonymize.config.config', schema_file='./tests/schemes/valid_schema.yml', _schema=None): yield config + + +@pytest.fixture +def mocked_faker_initializer(): + # Patch 
the faker_initializer instance with a Faker mock + with patch('pganonymize.providers.faker_initializer._faker'): + yield faker_initializer + + +@pytest.fixture +def faker_initializer_with_localization(mocked_faker_initializer): + # Patch the faker_initializer instance with localization options + with patch.object(mocked_faker_initializer, '_options', {'locales': ('de_DE', 'en_US'), 'default_locale': 'en_US'}): + yield mocked_faker_initializer diff --git a/tests/test_providers.py b/tests/test_providers.py index 555aac2c..45233e2f 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -5,7 +5,6 @@ import pytest import six from mock import MagicMock, Mock, call, patch -from mock.mock import PropertyMock from pganonymize import exceptions, providers from pganonymize.exceptions import InvalidProviderArgument @@ -147,11 +146,15 @@ def test_alter_value_with_unkown_locale(self): with pytest.raises(InvalidProviderArgument): providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', locale='de_DE') - @patch('pganonymize.providers.faker_initializer._faker') - def test_alter_value_use_default_locale(self, mock_faker): - with patch('pganonymize.providers.faker_initializer.default_locale', new=PropertyMock(return_value='en_US')): - providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth') - assert mock_faker['en_US'].date_of_birth.call_count == 1 + def test_alter_value_use_default_locale(self, faker_initializer_with_localization): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth') + faker = faker_initializer_with_localization._faker + assert faker[faker_initializer_with_localization.default_locale].date_of_birth.call_count == 1 + + def test_alter_value_ignore_default_locale(self, faker_initializer_with_localization): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', locale=None) + faker = faker_initializer_with_localization._faker + assert faker.date_of_birth.call_count == 1 class 
TestMaskProvider(object): diff --git a/tests/test_utils.py b/tests/test_utils.py index 6d207bcb..ac5cdc50 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,7 +3,6 @@ import pytest from mock import ANY, Mock, call, patch -from mock.mock import PropertyMock from tests.utils import quote_ident From 3a9bcf5d416a835d85008e32f277775c729582e8 Mon Sep 17 00:00:00 2001 From: Timo Steidle Date: Tue, 29 Nov 2022 12:11:12 +0100 Subject: [PATCH 07/15] Add docs --- docs/index.rst | 1 + docs/localization.rst | 71 +++++++++++++++++++++++++++++++++++++++++++ docs/schema.rst | 2 ++ 3 files changed, 74 insertions(+) create mode 100644 docs/localization.rst diff --git a/docs/index.rst b/docs/index.rst index 091dd017..e436d9da 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,6 +8,7 @@ Contents: readme schema + localization api tests documentation diff --git a/docs/localization.rst b/docs/localization.rst new file mode 100644 index 00000000..3d15e3f5 --- /dev/null +++ b/docs/localization.rst @@ -0,0 +1,71 @@ +Localization +============ + +It's possible to use the localization feature of ``Faker`` to localize the generated data. + +To localize the data, add the locales to use as a global option to the YAML schema: + +.. code-block:: yaml + + tables: + auth_user: + fields: + - name: + provider: + name: fake.name + - street: + provider: + name: fake.street_address + - city: + provider: + name: fake.city + + options: + faker: + locales: + - de_DE + - en_US + +Now any field using the ``Faker`` provider will generate localized data. When multiple locales are configured, ``Faker`` +will use its `Multiple Locale Mode <https://faker.readthedocs.io/en/master/fakerclass.html#multiple-locale-mode>`_. +In the example above, ``Faker`` selects the locale randomly for each field and row. + +It's also possible to define the locale to use on field level and to define a default locale: + +..
code-block:: yaml + + tables: + - user: + primary_key: id + fields: + - name: + provider: + # No locale entry at all, use configured default_locale "de_DE" + name: fake.name + - city: + provider: + # Use "en_US" + name: fake.city + locale: en_US + - street: + provider: + # Use "cs_CZ" + name: fake.street_address + locale: cs_CZ + - zipcode: + provider: + # Use empty locale to ignore default_locale and to randomly select locale + name: fake.postcode + locale: + + options: + faker: + locales: + - de_DE + - en_US + - cs_CZ + default_locale: de_DE + +.. ATTENTION:: + Make sure that the ``Faker`` provider (e.g. ``street_name``) is supported by the + `Localized Provider <https://faker.readthedocs.io/en/master/locales.html>`_. diff --git a/docs/schema.rst b/docs/schema.rst index b726a2c0..b8a99b3e 100644 --- a/docs/schema.rst +++ b/docs/schema.rst @@ -274,6 +274,8 @@ with ``fake`` and then use the function name from the Faker library, e.g: Some fake functions allow additional parameters to be passed, these can be specified in the schema as ``kwargs``. +For localization options see :doc:`localization`. + .. note:: Please note: using the ``Faker`` library will generate randomly generated data for each data row within a table. This will dramatically slow down the anonymization process.
From 2761e47cf0fcf01576de9a8c9f3aeca5ba0369e9 Mon Sep 17 00:00:00 2001 From: Timo Steidle Date: Tue, 29 Nov 2022 12:42:58 +0100 Subject: [PATCH 08/15] Add docs --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1f2e60a..696cb5f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## Development +* [#47](https://github.com/rheinwerk-verlag/pganonymize/pull/47): Add support for localized "Faker" data + ## 0.9.0 (2022-11-23) * [#46](https://github.com/rheinwerk-verlag/pganonymize/pull/46): Broken Python 2.7 compatibility From 4a29417cf067eb2ff02dc116c1f56222d8688b87 Mon Sep 17 00:00:00 2001 From: Timo Steidle Date: Tue, 29 Nov 2022 12:49:22 +0100 Subject: [PATCH 09/15] Configure psycopg2 to support UUID objects --- pganonymize/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pganonymize/utils.py b/pganonymize/utils.py index f0b31e8a..e5730818 100644 --- a/pganonymize/utils.py +++ b/pganonymize/utils.py @@ -21,6 +21,9 @@ from pganonymize.constants import DEFAULT_CHUNK_SIZE, DEFAULT_PRIMARY_KEY from pganonymize.providers import provider_registry +# Needed to work with UUID objects +psycopg2.extras.register_uuid() + def anonymize_tables(connection, definitions, verbose=False, dry_run=False): """ From a76083756ff9cbb57220d7f06358555165fd8476 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Tue, 29 Nov 2022 12:53:29 +0100 Subject: [PATCH 10/15] Changed README and CHANGELOG to reStructuredText (no more recommonmark requirement necessary) --- CHANGELOG.md => CHANGELOG.rst | 0 CONTRIBUTING.rst | 4 ++-- README.rst | 14 +++++++------- docs/changelog.md | 1 - docs/changelog.rst | 1 + docs/conf.py | 10 +++------- docs/readme.md | 1 - 7 files changed, 13 insertions(+), 18 deletions(-) rename CHANGELOG.md => CHANGELOG.rst (100%) delete mode 120000 docs/changelog.md create mode 100644 docs/changelog.rst delete mode 120000 docs/readme.md diff --git a/CHANGELOG.md b/CHANGELOG.rst similarity index 100% rename 
from CHANGELOG.md rename to CHANGELOG.rst diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index c7640480..ed67cb9b 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -17,14 +17,14 @@ Making changes Create a fork if you want to make changes or clone the repo if you want a readonly access to the current development version: -.. code-block:: sh +.. code-block:: bash $ git clone git@github.com:rheinwerk-verlag/postgresql-anonymizer.git $ cd postgresql-anonymizer For the development use a virtualenv or install the requirements directly: -.. code-block:: sh +.. code-block:: bash $ sudo pip install -r requirements.txt diff --git a/README.rst b/README.rst index 516ef0df..d06acd2e 100644 --- a/README.rst +++ b/README.rst @@ -56,14 +56,14 @@ Installation The default installation method is to use ``pip``: -.. code-block:: sh +.. code-block:: $ pip install pganonymize Usage ----- -.. code-block:: sh +.. code-block:: usage: pganonymize [-h] [-v] [-l] [--schema SCHEMA] [--dbname DBNAME] [--user USER] [--password PASSWORD] [--host HOST] @@ -94,7 +94,7 @@ all anonymization rules for that database. Take a look at the `schema documentat Example calls: -.. code-block:: sh +.. code-block:: $ pganonymize --schema=myschema.yml \ --dbname=test_database \ @@ -118,13 +118,13 @@ With the ``--dump-file`` argument it is possible to create a dump file after ano that the ``pg_dump`` command from the ``postgresql-client-common`` library is necessary to create the dump file for the database, e.g. under Linux: -.. code-block:: sh +.. code-block:: $ sudo apt-get install postgresql-client-common Example call: -.. code-block:: sh +.. code-block:: $ pganonymize --schema=myschema.yml \ --dbname=test_database \ @@ -139,13 +139,13 @@ Docker If you want to run the anonymizer within a Docker container you first have to build the image: -.. code-block:: sh +.. code-block:: $ docker build -t pganonymize . 
After that you can pass a schema file to the container, using Docker volumes, and call the anonymizer: -.. code-block:: sh +.. code-block:: $ docker run \ -v <path to your schema>:/schema.yml \ diff --git a/docs/changelog.md b/docs/changelog.md deleted file mode 120000 index 04c99a55..00000000 --- a/docs/changelog.md +++ /dev/null @@ -1 +0,0 @@ -../CHANGELOG.md \ No newline at end of file diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 00000000..565b0521 --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1 @@ +.. include:: ../CHANGELOG.rst diff --git a/docs/conf.py b/docs/conf.py index 4f535cdd..cf7f31dc 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,8 +15,6 @@ import sys import os -from recommonmark.parser import CommonMarkParser - # If extensions (or modules to document with autodoc) are in another # directory, add these directories to sys.path here. If the directory is # relative to the documentation root, use os.path.abspath to make it @@ -47,12 +45,10 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] -source_parsers = { - '.md': CommonMarkParser, -} +#source_parsers = {} # The suffix of source filenames. -source_suffix = ['.rst', '.md'] +source_suffix = ['.rst'] # The encoding of source files. #source_encoding = 'utf-8-sig' @@ -148,7 +144,7 @@ # here, relative to this directory. They are copied after the builtin # static files, so a file named "default.css" will overwrite the builtin # "default.css". -#html_static_path = ['_static'] +html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page # bottom, using the given strftime format.
diff --git a/docs/readme.md b/docs/readme.md deleted file mode 120000 index 32d46ee8..00000000 --- a/docs/readme.md +++ /dev/null @@ -1 +0,0 @@ -../README.md \ No newline at end of file From 3357338816c20043a16dd469dea7e1ca785436d7 Mon Sep 17 00:00:00 2001 From: Timo Steidle Date: Tue, 29 Nov 2022 12:57:01 +0100 Subject: [PATCH 11/15] Adjust CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 696cb5f3..79aa12c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ## Development -* [#47](https://github.com/rheinwerk-verlag/pganonymize/pull/47): Add support for localized "Faker" data +* [#48](https://github.com/rheinwerk-verlag/pganonymize/pull/48): Add support for localized "Faker" data ## 0.9.0 (2022-11-23) From 931a0fd8d4422cc149508fa174e34c0825c5b793 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Tue, 29 Nov 2022 13:00:37 +0100 Subject: [PATCH 12/15] #47: Fixed changelog headings (and pull request link) --- CHANGELOG.rst | 63 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 79aa12c0..5b6ad4a8 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,21 +1,26 @@ -# Changelog +Changelog +========= -## Development +Development +----------- * [#48](https://github.com/rheinwerk-verlag/pganonymize/pull/48): Add support for localized "Faker" data -## 0.9.0 (2022-11-23) +0.9.0 (2022-11-23) +------------------ * [#46](https://github.com/rheinwerk-verlag/pganonymize/pull/46): Broken Python 2.7 compatibility * [#45](https://github.com/rheinwerk-verlag/pganonymize/pull/45): Add partial masked provider ([Tilley](https://github.com/Tilley/)) * [#44](https://github.com/rheinwerk-verlag/pganonymize/pull/44): Pass kwargs through to faker functions from schema ([Tilley](https://github.com/Tilley/)) -## 0.8.0 (2022-03-15) +0.8.0 (2022-03-15) +------------------ * 
[#39](https://github.com/rheinwerk-verlag/pganonymize/issues/39): Renamed project to "pganonymize" * [#38](https://github.com/rheinwerk-verlag/pganonymize/pull/38): Allow environment variables in schema definition ([nurikk](https://github.com/nurikk)) -## 0.7.0 (2021-11-30) +0.7.0 (2021-11-30) +------------------ * [#34](https://github.com/rheinwerk-verlag/pganonymize/issues/34): Subprocess "run" being used on Python2.7 * [#35](https://github.com/rheinwerk-verlag/pganonymize/issues/35): parmap no longer supports Python 2.7 @@ -24,80 +29,96 @@ * [#32](https://github.com/rheinwerk-verlag/pganonymize/pull/32): Fixed pg_dump arguments ([korsar182](https://github.com/korsar182)) * Simplified provider registration (no metaclass usage anymore) -## 0.6.1 (2021-07-13) +0.6.1 (2021-07-13) +------------------ * Added missing dependencies for the `setup.py` -## 0.6.0 (2021-07-13) +0.6.0 (2021-07-13) +------------------ * [#28](https://github.com/rheinwerk-verlag/pganonymize/pull/25): Add json support ([nurikk](https://github.com/nurikk)) * [#27](https://github.com/rheinwerk-verlag/pganonymize/pull/25): Better anonymisation ([nurikk](https://github.com/nurikk)) * [#25](https://github.com/rheinwerk-verlag/pganonymize/pull/25): Remove column specification for `cursor.copy_from` call ([nurikk](https://github.com/nurikk)) -## 0.5.0 (2021-06-30) +0.5.0 (2021-06-30) +------------------ * [#22](https://github.com/rheinwerk-verlag/pganonymize/pull/22): Fix table and column name quotes in `cursor.copy_from` call ([nurikk](https://github.com/nurikk)) * [#23](https://github.com/rheinwerk-verlag/pganonymize/pull/23): Allow uniq faker ([nurikk](https://github.com/nurikk)) -## 0.4.1 (2021-05-27) +0.4.1 (2021-05-27) +------------------ * [#19](https://github.com/rheinwerk-verlag/pganonymize/pull/19): Make chunk size in the table definition dynamic ([halilkaya](https://github.com/halilkaya)) -## 0.4.0 (2021-05-05) +0.4.0 (2021-05-05) +------------------ * 
[#18](https://github.com/rheinwerk-verlag/pganonymize/pull/18): Specify (SQL WHERE) search_condition, to filter the table for rows to be anonymized ([bobslee](https://github.com/bobslee)) * [#17](https://github.com/rheinwerk-verlag/pganonymize/pull/17): Fix anonymizing error if there is a JSONB column in a table ([koptelovav](https://github.com/koptelovav)) -## 0.3.3 (2021-04-16) +0.3.3 (2021-04-16) +------------------ * [#16](https://github.com/rheinwerk-verlag/pganonymize/issues/16): Preserve column and table cases during the copy process -## 0.3.2 (2021-01-25) +0.3.2 (2021-01-25) +------------------ * [#15](https://github.com/rheinwerk-verlag/pganonymize/pull/15): Fix for exclude bug ([abhinavvaidya90](https://github.com/abhinavvaidya90)) -## 0.3.1 (2020-12-04) +0.3.1 (2020-12-04) +------------------ * [#13](https://github.com/rheinwerk-verlag/pganonymize/pull/13): Fixed a syntax error if no truncated tables are defined ([ray-man](https://github.com/ray-man)) -## 0.3.0 (2020-02-11) +0.3.0 (2020-02-11) +------------------ * Use [python-poetry](https://github.com/python-poetry/poetry) for requirements management * Added commandline argument to list all available providers (#4) * Added commandline argument to create a dump file (#5) * Execute table truncation in one statement to avoid foreign key constraint errors (thanks to [W1ldPo1nter](https://github.com/W1ldPo1nter)) -## 0.2.4 (2020-01-03) +0.2.4 (2020-01-03) +------------------ * Fixed several issues with the usage of ``dict.keys`` and Python 3 -## 0.2.3 (2020-01-02) +0.2.3 (2020-01-02) +------------------ * Fixed the wrong cStringIO import for Python 3 * Removed Travis-CI file in favor of the Github actions -## 0.2.2 (2020-01-02) +0.2.2 (2020-01-02) +------------------ * Hide the progressbar completely if verbose is set to ``False`` * Restructured the requirement files and added flake8 to Travis CI -## 0.2.1 (2019-12-20) +0.2.1 (2019-12-20) +------------------ * Added field based, regular expression excludes 
(to skip data under certain conditions). Currently only regular expressions are supported and the exclusion affects the whole row, not just one single column. -## 0.2.0 (2019-12-20) +0.2.0 (2019-12-20) +------------------ * Added provider classes * Added new providers: * choice - returns a random list element * mask - replaces the original value with a static sign -## 0.1.1 (2019-12-18) +0.1.1 (2019-12-18) +------------------ Changed setup.py -## 0.1.0 (2019-12-16) +0.1.0 (2019-12-16) +------------------ Initial release of the prototype From ad5f9ea26e6faa45f9c1e9a502406e74d1ca80a0 Mon Sep 17 00:00:00 2001 From: Henning Kage Date: Tue, 29 Nov 2022 13:18:11 +0100 Subject: [PATCH 13/15] #47: Added missing docstring parameter --- pganonymize/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pganonymize/config.py b/pganonymize/config.py index 1fed7adc..08799960 100644 --- a/pganonymize/config.py +++ b/pganonymize/config.py @@ -16,8 +16,8 @@ def schema(self): """ Return the schema loaded from the given YAML schema file. - :return: - :rtype: + :return: The parsed YAML schema. 
+ :rtype: dict """ if self._schema is None and self.schema_file is not None: self._schema = load_schema(self.schema_file) From c609fb7d17df3d206a13ea82bfda7c3b52acaad5 Mon Sep 17 00:00:00 2001 From: Timo Steidle Date: Tue, 29 Nov 2022 13:37:24 +0100 Subject: [PATCH 14/15] Fix CHANGELOG --- CHANGELOG.rst | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5b6ad4a8..c6c0c3b0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,29 +4,30 @@ Changelog Development ----------- -* [#48](https://github.com/rheinwerk-verlag/pganonymize/pull/48): Add support for localized "Faker" data +* `#49 <https://github.com/rheinwerk-verlag/pganonymize/pull/49>`_: Configure psycopg2 to support UUID objects +* `#48 <https://github.com/rheinwerk-verlag/pganonymize/pull/48>`_: Add support for localized "Faker" data 0.9.0 (2022-11-23) ------------------ -* [#46](https://github.com/rheinwerk-verlag/pganonymize/pull/46): Broken Python 2.7 compatibility -* [#45](https://github.com/rheinwerk-verlag/pganonymize/pull/45): Add partial masked provider ([Tilley](https://github.com/Tilley/)) -* [#44](https://github.com/rheinwerk-verlag/pganonymize/pull/44): Pass kwargs through to faker functions from schema ([Tilley](https://github.com/Tilley/)) +* `#46 <https://github.com/rheinwerk-verlag/pganonymize/pull/46>`_: Broken Python 2.7 compatibility +* `#45 <https://github.com/rheinwerk-verlag/pganonymize/pull/45>`_: Add partial masked provider (`Tilley <https://github.com/Tilley/>`_) +* `#44 <https://github.com/rheinwerk-verlag/pganonymize/pull/44>`_: Pass kwargs through to faker functions from schema (`Tilley <https://github.com/Tilley/>`_) 0.8.0 (2022-03-15) ------------------ -* [#39](https://github.com/rheinwerk-verlag/pganonymize/issues/39): Renamed project to "pganonymize" -* [#38](https://github.com/rheinwerk-verlag/pganonymize/pull/38): Allow environment variables in schema definition 
([nurikk](https://github.com/nurikk)) +* `#39 <https://github.com/rheinwerk-verlag/pganonymize/issues/39>`_: Renamed project to "pganonymize" +* `#38 <https://github.com/rheinwerk-verlag/pganonymize/pull/38>`_: Allow environment variables in schema definition (`nurikk <https://github.com/nurikk>`_) 0.7.0 (2021-11-30) ------------------ -* [#34](https://github.com/rheinwerk-verlag/pganonymize/issues/34): Subprocess "run" being used on Python2.7 -*
[#35](https://github.com/rheinwerk-verlag/pganonymize/issues/35): parmap no longer supports Python 2.7 +* `#34 <https://github.com/rheinwerk-verlag/pganonymize/issues/34>`_: Subprocess "run" being used on Python2.7 +* `#35 <https://github.com/rheinwerk-verlag/pganonymize/issues/35>`_: parmap no longer supports Python 2.7 * Dropped Python 3.5 support * Pinned libraries Python 2.7 -* [#32](https://github.com/rheinwerk-verlag/pganonymize/pull/32): Fixed pg_dump arguments ([korsar182](https://github.com/korsar182)) +* `#32 <https://github.com/rheinwerk-verlag/pganonymize/pull/32>`_: Fixed pg_dump arguments (`korsar182 <https://github.com/korsar182>`_) * Simplified provider registration (no metaclass usage anymore) 0.6.1 (2021-07-13) @@ -37,49 +38,49 @@ Development 0.6.0 (2021-07-13) ------------------ -* [#28](https://github.com/rheinwerk-verlag/pganonymize/pull/25): Add json support ([nurikk](https://github.com/nurikk)) -* [#27](https://github.com/rheinwerk-verlag/pganonymize/pull/25): Better anonymisation ([nurikk](https://github.com/nurikk)) -* [#25](https://github.com/rheinwerk-verlag/pganonymize/pull/25): Remove column specification for `cursor.copy_from` call ([nurikk](https://github.com/nurikk)) +* `#28 <https://github.com/rheinwerk-verlag/pganonymize/pull/25>`_: Add json support (`nurikk <https://github.com/nurikk>`_) +* `#27 <https://github.com/rheinwerk-verlag/pganonymize/pull/25>`_: Better anonymisation (`nurikk <https://github.com/nurikk>`_) +* `#25 <https://github.com/rheinwerk-verlag/pganonymize/pull/25>`_: Remove column specification for `cursor.copy_from` call (`nurikk <https://github.com/nurikk>`_) 0.5.0 (2021-06-30) ------------------ -* [#22](https://github.com/rheinwerk-verlag/pganonymize/pull/22): Fix table and column name quotes in `cursor.copy_from` call ([nurikk](https://github.com/nurikk)) -* [#23](https://github.com/rheinwerk-verlag/pganonymize/pull/23): Allow uniq faker ([nurikk](https://github.com/nurikk)) +* `#22 
<https://github.com/rheinwerk-verlag/pganonymize/pull/22>`_: Fix table and column name quotes in `cursor.copy_from` call (`nurikk <https://github.com/nurikk>`_) +* `#23 <https://github.com/rheinwerk-verlag/pganonymize/pull/23>`_: Allow uniq faker (`nurikk <https://github.com/nurikk>`_) 0.4.1 (2021-05-27) ------------------ -* [#19](https://github.com/rheinwerk-verlag/pganonymize/pull/19): Make chunk size in the table definition dynamic ([halilkaya](https://github.com/halilkaya)) +* `#19 <https://github.com/rheinwerk-verlag/pganonymize/pull/19>`_: Make chunk size in the table definition dynamic (`halilkaya <https://github.com/halilkaya>`_) 0.4.0 (2021-05-05) ------------------ -*
[#18](https://github.com/rheinwerk-verlag/pganonymize/pull/18): Specify (SQL WHERE) search_condition, to filter the table for rows to be anonymized ([bobslee](https://github.com/bobslee)) -* [#17](https://github.com/rheinwerk-verlag/pganonymize/pull/17): Fix anonymizing error if there is a JSONB column in a table ([koptelovav](https://github.com/koptelovav)) +* `#18 <https://github.com/rheinwerk-verlag/pganonymize/pull/18>`_: Specify (SQL WHERE) search_condition, to filter the table for rows to be anonymized (`bobslee <https://github.com/bobslee>`_) +* `#17 <https://github.com/rheinwerk-verlag/pganonymize/pull/17>`_: Fix anonymizing error if there is a JSONB column in a table (`koptelovav <https://github.com/koptelovav>`_) 0.3.3 (2021-04-16) ------------------ -* [#16](https://github.com/rheinwerk-verlag/pganonymize/issues/16): Preserve column and table cases during the copy process +* `#16 <https://github.com/rheinwerk-verlag/pganonymize/issues/16>`_: Preserve column and table cases during the copy process 0.3.2 (2021-01-25) ------------------ -* [#15](https://github.com/rheinwerk-verlag/pganonymize/pull/15): Fix for exclude bug ([abhinavvaidya90](https://github.com/abhinavvaidya90)) +* `#15 <https://github.com/rheinwerk-verlag/pganonymize/pull/15>`_: Fix for exclude bug (`abhinavvaidya90 <https://github.com/abhinavvaidya90>`_) 0.3.1 (2020-12-04) ------------------ -* [#13](https://github.com/rheinwerk-verlag/pganonymize/pull/13): Fixed a syntax error if no truncated tables are defined ([ray-man](https://github.com/ray-man)) +* `#13 <https://github.com/rheinwerk-verlag/pganonymize/pull/13>`_: Fixed a syntax error if no truncated tables are defined (`ray-man <https://github.com/ray-man>`_) 0.3.0 (2020-02-11) ------------------ -* Use [python-poetry](https://github.com/python-poetry/poetry) for requirements management +* Use `python-poetry <https://github.com/python-poetry/poetry>`_ for requirements management * Added commandline argument to list all available providers 
(#4) * Added commandline argument to create a dump file (#5) -* Execute table truncation in one statement to avoid foreign key constraint errors (thanks to [W1ldPo1nter](https://github.com/W1ldPo1nter)) +* Execute table truncation in one statement to avoid foreign key constraint errors (thanks to `W1ldPo1nter <https://github.com/W1ldPo1nter>`_) 0.2.4 (2020-01-03) ------------------ From 1b33b7d2ee5bf414226ab97b7879497e79e2e31e Mon Sep 17 00:00:00 2001 From: Henning Kage
Date: Tue, 29 Nov 2022 13:54:27 +0100 Subject: [PATCH 15/15] Release 0.10.0 --- CHANGELOG.rst | 3 +++ pganonymize/version.py | 2 +- setup.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c6c0c3b0..057454f3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,9 @@ Changelog Development ----------- +0.10.0 (2022-11-29) +------------------- + * `#49 <https://github.com/rheinwerk-verlag/pganonymize/pull/49>`_: Configure psycopg2 to support UUID objects * `#48 <https://github.com/rheinwerk-verlag/pganonymize/pull/48>`_: Add support for localized "Faker" data diff --git a/pganonymize/version.py b/pganonymize/version.py index d4524379..a8cad440 100644 --- a/pganonymize/version.py +++ b/pganonymize/version.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -__version__ = '0.9.0' +__version__ = '0.10.0' diff --git a/setup.py b/setup.py index aaa98d55..b83774ee 100755 --- a/setup.py +++ b/setup.py @@ -76,7 +76,7 @@ def run(self): url='https://github.com/rheinwerk-verlag/pganonymize', license='MIT license', classifiers=[ - 'Development Status :: 3 - Alpha', + 'Development Status :: 4 - Beta', 'Environment :: Console', 'Intended Audience :: Developers', 'License :: OSI Approved :: MIT License',