diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index f1f2e60..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,101 +0,0 @@ -# Changelog - -## Development - -## 0.9.0 (2022-11-23) - -* [#46](https://github.com/rheinwerk-verlag/pganonymize/pull/46): Broken Python 2.7 compatibility -* [#45](https://github.com/rheinwerk-verlag/pganonymize/pull/45): Add partial masked provider ([Tilley](https://github.com/Tilley/)) -* [#44](https://github.com/rheinwerk-verlag/pganonymize/pull/44): Pass kwargs through to faker functions from schema ([Tilley](https://github.com/Tilley/)) - -## 0.8.0 (2022-03-15) - -* [#39](https://github.com/rheinwerk-verlag/pganonymize/issues/39): Renamed project to "pganonymize" -* [#38](https://github.com/rheinwerk-verlag/pganonymize/pull/38): Allow environment variables in schema definition ([nurikk](https://github.com/nurikk)) - -## 0.7.0 (2021-11-30) - -* [#34](https://github.com/rheinwerk-verlag/pganonymize/issues/34): Subprocess "run" being used on Python2.7 -* [#35](https://github.com/rheinwerk-verlag/pganonymize/issues/35): parmap no longer supports Python 2.7 - * Dropped Python 3.5 support - * Pinned libraries Python 2.7 -* [#32](https://github.com/rheinwerk-verlag/pganonymize/pull/32): Fixed pg_dump arguments ([korsar182](https://github.com/korsar182)) -* Simplified provider registration (no metaclass usage anymore) - -## 0.6.1 (2021-07-13) - -* Added missing dependencies for the `setup.py` - -## 0.6.0 (2021-07-13) - -* [#28](https://github.com/rheinwerk-verlag/pganonymize/pull/25): Add json support ([nurikk](https://github.com/nurikk)) -* [#27](https://github.com/rheinwerk-verlag/pganonymize/pull/25): Better anonymisation ([nurikk](https://github.com/nurikk)) -* [#25](https://github.com/rheinwerk-verlag/pganonymize/pull/25): Remove column specification for `cursor.copy_from` call ([nurikk](https://github.com/nurikk)) - -## 0.5.0 (2021-06-30) - -* [#22](https://github.com/rheinwerk-verlag/pganonymize/pull/22): Fix table and 
column name quotes in `cursor.copy_from` call ([nurikk](https://github.com/nurikk)) -* [#23](https://github.com/rheinwerk-verlag/pganonymize/pull/23): Allow uniq faker ([nurikk](https://github.com/nurikk)) - -## 0.4.1 (2021-05-27) - -* [#19](https://github.com/rheinwerk-verlag/pganonymize/pull/19): Make chunk size in the table definition dynamic ([halilkaya](https://github.com/halilkaya)) - -## 0.4.0 (2021-05-05) - -* [#18](https://github.com/rheinwerk-verlag/pganonymize/pull/18): Specify (SQL WHERE) search_condition, to filter the table for rows to be anonymized ([bobslee](https://github.com/bobslee)) -* [#17](https://github.com/rheinwerk-verlag/pganonymize/pull/17): Fix anonymizing error if there is a JSONB column in a table ([koptelovav](https://github.com/koptelovav)) - -## 0.3.3 (2021-04-16) - -* [#16](https://github.com/rheinwerk-verlag/pganonymize/issues/16): Preserve column and table cases during the copy process - -## 0.3.2 (2021-01-25) - -* [#15](https://github.com/rheinwerk-verlag/pganonymize/pull/15): Fix for exclude bug ([abhinavvaidya90](https://github.com/abhinavvaidya90)) - -## 0.3.1 (2020-12-04) - -* [#13](https://github.com/rheinwerk-verlag/pganonymize/pull/13): Fixed a syntax error if no truncated tables are defined ([ray-man](https://github.com/ray-man)) - -## 0.3.0 (2020-02-11) - -* Use [python-poetry](https://github.com/python-poetry/poetry) for requirements management -* Added commandline argument to list all available providers (#4) -* Added commandline argument to create a dump file (#5) -* Execute table truncation in one statement to avoid foreign key constraint errors (thanks to [W1ldPo1nter](https://github.com/W1ldPo1nter)) - -## 0.2.4 (2020-01-03) - -* Fixed several issues with the usage of ``dict.keys`` and Python 3 - -## 0.2.3 (2020-01-02) - -* Fixed the wrong cStringIO import for Python 3 -* Removed Travis-CI file in favor of the Github actions - -## 0.2.2 (2020-01-02) - -* Hide the progressbar completely if verbose is set to 
``False`` -* Restructured the requirement files and added flake8 to Travis CI - -## 0.2.1 (2019-12-20) - -* Added field based, regular expression excludes (to skip data under certain conditions). - Currently only regular expressions are supported and the exclusion affects the whole row, - not just one single column. - -## 0.2.0 (2019-12-20) - -* Added provider classes -* Added new providers: - * choice - returns a random list element - * mask - replaces the original value with a static sign - -## 0.1.1 (2019-12-18) - -Changed setup.py - -## 0.1.0 (2019-12-16) - -Initial release of the prototype diff --git a/CHANGELOG.rst b/CHANGELOG.rst new file mode 100644 index 0000000..057454f --- /dev/null +++ b/CHANGELOG.rst @@ -0,0 +1,128 @@ +Changelog +========= + +Development +----------- + +0.10.0 (2022-11-29) +------------------- + +* `#49 `_: Configure psycopg2 to support UUID objects +* `#48 `_: Add support for localized "Faker" data + +0.9.0 (2022-11-23) +------------------ + +* `#46 `_: Broken Python 2.7 compatibility +* `#45 `_: Add partial masked provider (`Tilley `_) +* `#44 `_: Pass kwargs through to faker functions from schema (`Tilley `_) + +0.8.0 (2022-03-15) +------------------ + +* `#39 `_: Renamed project to "pganonymize" +* `#38 `_: Allow environment variables in schema definition (`nurikk `_) + +0.7.0 (2021-11-30) +------------------ + +* `#34 `_: Subprocess "run" being used on Python2.7 +* `#35 `_: parmap no longer supports Python 2.7 + * Dropped Python 3.5 support + * Pinned libraries Python 2.7 +* `#32 `_: Fixed pg_dump arguments (`korsar182 `_) +* Simplified provider registration (no metaclass usage anymore) + +0.6.1 (2021-07-13) +------------------ + +* Added missing dependencies for the `setup.py` + +0.6.0 (2021-07-13) +------------------ + +* `#28 `_: Add json support (`nurikk `_) +* `#27 `_: Better anonymisation (`nurikk `_) +* `#25 `_: Remove column specification for `cursor.copy_from` call (`nurikk `_) + +0.5.0 (2021-06-30) +------------------ + 
+* `#22 `_: Fix table and column name quotes in `cursor.copy_from` call (`nurikk `_) +* `#23 `_: Allow uniq faker (`nurikk `_) + +0.4.1 (2021-05-27) +------------------ + +* `#19 `_: Make chunk size in the table definition dynamic (`halilkaya `_) + +0.4.0 (2021-05-05) +------------------ + +* `#18 `_: Specify (SQL WHERE) search_condition, to filter the table for rows to be anonymized (`bobslee `_) +* `#17 `_: Fix anonymizing error if there is a JSONB column in a table (`koptelovav `_) + +0.3.3 (2021-04-16) +------------------ + +* `#16 `_: Preserve column and table cases during the copy process + +0.3.2 (2021-01-25) +------------------ + +* `#15 `_: Fix for exclude bug (`abhinavvaidya90 `_) + +0.3.1 (2020-12-04) +------------------ + +* `#13 `_: Fixed a syntax error if no truncated tables are defined (`ray-man `_) + +0.3.0 (2020-02-11) +------------------ + +* Use `python-poetry `_ for requirements management +* Added commandline argument to list all available providers (#4) +* Added commandline argument to create a dump file (#5) +* Execute table truncation in one statement to avoid foreign key constraint errors (thanks to `W1ldPo1nter `_) + +0.2.4 (2020-01-03) +------------------ + +* Fixed several issues with the usage of ``dict.keys`` and Python 3 + +0.2.3 (2020-01-02) +------------------ + +* Fixed the wrong cStringIO import for Python 3 +* Removed Travis-CI file in favor of the Github actions + +0.2.2 (2020-01-02) +------------------ + +* Hide the progressbar completely if verbose is set to ``False`` +* Restructured the requirement files and added flake8 to Travis CI + +0.2.1 (2019-12-20) +------------------ + +* Added field based, regular expression excludes (to skip data under certain conditions). + Currently only regular expressions are supported and the exclusion affects the whole row, + not just one single column. 
+ +0.2.0 (2019-12-20) +------------------ + +* Added provider classes +* Added new providers: + * choice - returns a random list element + * mask - replaces the original value with a static sign + +0.1.1 (2019-12-18) +------------------ + +Changed setup.py + +0.1.0 (2019-12-16) +------------------ + +Initial release of the prototype diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index c764048..ed67cb9 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -17,14 +17,14 @@ Making changes Create a fork if you want to make changes or clone the repo if you want a readonly access to the current development version: -.. code-block:: sh +.. code-block:: bash $ git clone git@github.com:rheinwerk-verlag/postgresql-anonymizer.git $ cd postgresql-anonymizer For the development use a virtualenv or install the requirements directly: -.. code-block:: sh +.. code-block:: bash $ sudo pip install -r requirements.txt diff --git a/README.rst b/README.rst index 516ef0d..d06acd2 100644 --- a/README.rst +++ b/README.rst @@ -56,14 +56,14 @@ Installation The default installation method is to use ``pip``: -.. code-block:: sh +.. code-block:: $ pip install pganonymize Usage ----- -.. code-block:: sh +.. code-block:: usage: pganonymize [-h] [-v] [-l] [--schema SCHEMA] [--dbname DBNAME] [--user USER] [--password PASSWORD] [--host HOST] @@ -94,7 +94,7 @@ all anonymization rules for that database. Take a look at the `schema documentat Example calls: -.. code-block:: sh +.. code-block:: $ pganonymize --schema=myschema.yml \ --dbname=test_database \ @@ -118,13 +118,13 @@ With the ``--dump-file`` argument it is possible to create a dump file after ano that the ``pg_dump`` command from the ``postgresql-client-common`` library is necessary to create the dump file for the database, e.g. under Linux: -.. code-block:: sh +.. code-block:: $ sudo apt-get install postgresql-client-common Example call: -.. code-block:: sh +.. 
code-block:: $ pganonymize --schema=myschema.yml \ --dbname=test_database \ @@ -139,13 +139,13 @@ Docker If you want to run the anonymizer within a Docker container you first have to build the image: -.. code-block:: sh +.. code-block:: $ docker build -t pganonymize . After that you can pass a schema file to the container, using Docker volumes, and call the anonymizer: -.. code-block:: sh +.. code-block:: $ docker run \ -v :/schema.yml \ diff --git a/docs/changelog.md b/docs/changelog.md deleted file mode 120000 index 04c99a5..0000000 --- a/docs/changelog.md +++ /dev/null @@ -1 +0,0 @@ -../CHANGELOG.md \ No newline at end of file diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 0000000..565b052 --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1 @@ +.. include:: ../CHANGELOG.rst diff --git a/docs/conf.py b/docs/conf.py index 4f535cd..cf7f31d 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,8 +15,6 @@ import sys import os -from recommonmark.parser import CommonMarkParser - # If extensions (or modules to document with autodoc) are in another # directory, add these directories to sys.path here. If the directory is # relative to the documentation root, use os.path.abspath to make it @@ -47,12 +45,10 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] -source_parsers = { - '.md': CommonMarkParser, -} +#source_parsers = {} # The suffix of source filenames. -source_suffix = ['.rst', '.md'] +source_suffix = ['.rst'] # The encoding of source files. #source_encoding = 'utf-8-sig' @@ -148,7 +144,7 @@ # here, relative to this directory. They are copied after the builtin # static files, so a file named "default.css" will overwrite the builtin # "default.css". -#html_static_path = ['_static'] +html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page # bottom, using the given strftime format. 
diff --git a/docs/index.rst b/docs/index.rst index 091dd01..e436d9d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,6 +8,7 @@ Contents: readme schema + localization api tests documentation diff --git a/docs/localization.rst b/docs/localization.rst new file mode 100644 index 0000000..3d15e3f --- /dev/null +++ b/docs/localization.rst @@ -0,0 +1,71 @@ +Localization +============ + +It's possible to use the localization feature of ``Faker`` to localize the generated data. + +To localize the data, add the locales to use as a global option to the YAML schema: + +.. code-block:: yaml + + tables: + - auth_user: + fields: + - name: + provider: + name: fake.name + - street: + provider: + name: fake.street_address + - city: + provider: + name: fake.city + + options: + faker: + locales: + - de_DE + - en_US + +Now any field using the ``Faker`` provider will generate localized data. When multiple locales are configured, ``Faker`` +will use its `Multiple Locale Mode <https://faker.readthedocs.io/en/master/fakerclass.html#multiple-locale-mode>`_. +In the example above, ``Faker`` selects the locale randomly for each field and row. + +It's also possible to define the locale to use on field level and to define a default locale: + +.. code-block:: yaml + + tables: + - user: + primary_key: id + fields: + - name: + provider: + # No locale entry at all, use configured default_locale "de_DE" + name: fake.name + - city: + provider: + # Use "en_US" + name: fake.city + locale: en_US + - street: + provider: + # Use "cs_CZ" + name: fake.street_address + locale: cs_CZ + - zipcode: + provider: + # Use empty locale to ignore default_locale and to randomly select locale + name: fake.postcode + locale: + + options: + faker: + locales: + - de_DE + - en_US + - cs_CZ + default_locale: de_DE + +.. ATTENTION:: + Make sure that the ``Faker`` provider (e.g. ``street_name``) is supported by the + `Localized Provider <https://faker.readthedocs.io/en/master/locales.html>`_. 
diff --git a/docs/readme.md b/docs/readme.md deleted file mode 120000 index 32d46ee..0000000 --- a/docs/readme.md +++ /dev/null @@ -1 +0,0 @@ -../README.md \ No newline at end of file diff --git a/docs/schema.rst b/docs/schema.rst index b726a2c..b8a99b3 100644 --- a/docs/schema.rst +++ b/docs/schema.rst @@ -274,6 +274,8 @@ with ``fake`` and then use the function name from the Faker library, e.g: Some fake functions allow additional parameters to be passed, these can be specified in the schema as ``kwargs``. +For localization options see :doc:`localization`. + .. note:: Please note: using the ``Faker`` library will generate randomly generated data for each data row within a table. This will dramatically slow down the anonymization process. diff --git a/pganonymize/cli.py b/pganonymize/cli.py index dbf22a6..4a85c6b 100644 --- a/pganonymize/cli.py +++ b/pganonymize/cli.py @@ -6,9 +6,10 @@ import logging import time +from pganonymize.config import config from pganonymize.constants import DATABASE_ARGS, DEFAULT_SCHEMA_FILE from pganonymize.providers import provider_registry -from pganonymize.utils import anonymize_tables, create_database_dump, get_connection, load_config, truncate_tables +from pganonymize.utils import anonymize_tables, create_database_dump, get_connection, truncate_tables def get_pg_args(args): @@ -62,7 +63,7 @@ def main(args): list_provider_classes() return 0 - schema = load_config(args.schema) + config.schema_file = args.schema pg_args = get_pg_args(args) connection = get_connection(pg_args) @@ -73,8 +74,8 @@ def main(args): cursor.close() start_time = time.time() - truncate_tables(connection, schema.get('truncate', [])) - anonymize_tables(connection, schema.get('tables', []), verbose=args.verbose, dry_run=args.dry_run) + truncate_tables(connection) + anonymize_tables(connection, verbose=args.verbose, dry_run=args.dry_run) if not args.dry_run: connection.commit() diff --git a/pganonymize/config.py b/pganonymize/config.py new file mode 100644 index 
0000000..0879996 --- /dev/null +++ b/pganonymize/config.py @@ -0,0 +1,57 @@ +import os +import re + +import yaml + + +class Config(object): + """A class that wraps access to the given YAML schema file.""" + + def __init__(self): + self._schema = None + self.schema_file = None + + @property + def schema(self): + """ + Return the schema loaded from the given YAML schema file. + + :return: The parsed YAML schema. + :rtype: dict + """ + if self._schema is None and self.schema_file is not None: + self._schema = load_schema(self.schema_file) + return self._schema + + +def load_schema(schema_file): + # Original code from here https://gist.github.com/mkaranasou/ba83e25c835a8f7629e34dd7ede01931 + tag = '!ENV' + pattern = re.compile(r'.*?\${(\w+)}.*?') + custom_loader = yaml.FullLoader + custom_loader.add_implicit_resolver(tag, pattern, None) + + def constructor_env_variables(loader, node): + """ + Extract the environment variable from the node's value. + + :param yaml.Loader loader: The yaml loader + :param node: The current node in the yaml + :return: The parsed string that contains the value of the environment variable + """ + value = loader.construct_scalar(node) + match = pattern.findall(value) # to find all env variables in line + if match: + full_value = value + for g in match: + full_value = full_value.replace( + '${{{g}}}'.format(g=g), os.environ.get(g, g) + ) + return full_value + return value + + custom_loader.add_constructor(tag, constructor_env_variables) + return yaml.load(open(schema_file), Loader=custom_loader) + + +config = Config() diff --git a/pganonymize/providers.py b/pganonymize/providers.py index 0038521..cbc4c86 100644 --- a/pganonymize/providers.py +++ b/pganonymize/providers.py @@ -7,9 +7,58 @@ from faker import Faker +from pganonymize.config import config from pganonymize.exceptions import InvalidProvider, InvalidProviderArgument, ProviderAlreadyRegistered -fake_data = Faker() + +class FakerInitializer(object): + """A wrapper that allows to 
instantiate a faker instance with specific locales.""" + + def __init__(self): + self._faker = None + self._options = None + + @property + def options(self): + if self._options is None: + self._options = config.schema.get('options', {}).get('faker', {}) + return self._options + + @property + def default_locale(self): + return self.options.get('default_locale') + + @property + def faker(self): + """ + Return the actual :class:`faker.Faker` instance, with optional locales taken from the YAML schema. + + :return: A faker instance + :rtype: faker.Faker + """ + if self._faker is None: + locales = self.options.get('locales') + self._faker = Faker(locales) + return self._faker + + def get_locale_generator(self, locale): + """ + Get the internal generator for the given locale. + + :param str locale: A locale string + :raises InvalidProviderArgument: If locale is unknown (not configured within the global locales option). + :return: A Generator instance for the given locale + :rtype: faker.Generator + """ + try: + generator = self.faker[locale] + except KeyError: + raise InvalidProviderArgument('Locale \'{}\' is unknown. Have you added it to the global option ' + '(options.faker.locales)?'.format(locale)) + return generator + + +faker_initializer = FakerInitializer() class ProviderRegistry(object): @@ -33,10 +82,12 @@ def register(self, provider_class, provider_id): def get_provider(self, provider_id): """ - Return a provider by it's provider id. + Return a provider by its provider id. :param str provider_id: The string id of the desired provider. :raises InvalidProvider: If no provider can be found with the given id. + :return: The provider class that matches the id. 
+ :rtype: type """ for key, cls in self._registry.items(): if (cls.regex_match is True and re.match(re.compile(key), provider_id) is not None) or key == provider_id: @@ -78,14 +129,12 @@ class Provider(object): regex_match = False """Defines whether a provider matches it's id using regular expressions.""" - def __init__(self, **kwargs): - self.kwargs = kwargs - - def alter_value(self, value): + @classmethod + def alter_value(cls, original_value, **kwargs): """ Alter or replace the original value of the database column. - :param value: The original value of the database column. + :param original_value: The original value of the database column. """ raise NotImplementedError() @@ -94,15 +143,17 @@ def alter_value(self, value): class ChoiceProvider(Provider): """Provider that returns a random value from a list of choices.""" - def alter_value(self, value): - return random.choice(self.kwargs.get('values')) + @classmethod + def alter_value(cls, original_value, **kwargs): + return random.choice(kwargs.get('values')) @register('clear') class ClearProvider(Provider): """Provider to set a field value to None.""" - def alter_value(self, value): + @classmethod + def alter_value(cls, original_value, **kwargs): return None @@ -112,11 +163,15 @@ class FakeProvider(Provider): regex_match = True - def alter_value(self, value): - func_name = self.kwargs['name'].split('.', 1)[1] - func_kwargs = self.kwargs.get('kwargs', {}) + @classmethod + def alter_value(cls, original_value, **kwargs): + func_name = kwargs['name'].split('.', 1)[1] + func_kwargs = kwargs.get('kwargs', {}) + locale = kwargs.get('locale', faker_initializer.default_locale) + # Use the generator for the locale if a locale is configured (per field definition or as global default locale) + faker_generator = faker_initializer.get_locale_generator(locale) if locale else faker_initializer.faker try: - func = operator.attrgetter(func_name)(fake_data) + func = operator.attrgetter(func_name)(faker_generator) except 
AttributeError as exc: raise InvalidProviderArgument(exc) return func(**func_kwargs) @@ -129,9 +184,10 @@ class MaskProvider(Provider): default_sign = 'X' """The default string used to replace each character.""" - def alter_value(self, value): - sign = self.kwargs.get('sign', self.default_sign) or self.default_sign - return sign * len(value) + @classmethod + def alter_value(cls, original_value, **kwargs): + sign = kwargs.get('sign', cls.default_sign) or cls.default_sign + return sign * len(original_value) @register('partial_mask') @@ -143,15 +199,16 @@ class PartialMaskProvider(Provider): default_unmasked_right = 1 """The default string used to replace each character.""" - def alter_value(self, value): - sign = self.kwargs.get('sign', self.default_sign) or self.default_sign - unmasked_left = self.kwargs.get('unmasked_left', self.default_unmasked_left) or self.default_unmasked_left - unmasked_right = self.kwargs.get('unmasked_right', self.default_unmasked_right) or self.default_unmasked_right + @classmethod + def alter_value(cls, original_value, **kwargs): + sign = kwargs.get('sign', cls.default_sign) or cls.default_sign + unmasked_left = kwargs.get('unmasked_left', cls.default_unmasked_left) or cls.default_unmasked_left + unmasked_right = kwargs.get('unmasked_right', cls.default_unmasked_right) or cls.default_unmasked_right return ( - value[:unmasked_left] + - (len(value) - (unmasked_left + unmasked_right)) * sign + - value[-unmasked_right:] + original_value[:unmasked_left] + + (len(original_value) - (unmasked_left + unmasked_right)) * sign + + original_value[-unmasked_right:] ) @@ -162,10 +219,11 @@ class MD5Provider(Provider): default_max_length = 8 """The default length used for the number representation.""" - def alter_value(self, value): - as_number = self.kwargs.get('as_number', False) - as_number_length = self.kwargs.get('as_number_length', self.default_max_length) - hashed = md5(value.encode('utf-8')).hexdigest() + @classmethod + def alter_value(cls, 
original_value, **kwargs): + as_number = kwargs.get('as_number', False) + as_number_length = kwargs.get('as_number_length', cls.default_max_length) + hashed = md5(original_value.encode('utf-8')).hexdigest() if as_number: return int(hashed, 16) % (10 ** as_number_length) else: @@ -176,13 +234,15 @@ def alter_value(self, value): class SetProvider(Provider): """Provider to set a static value.""" - def alter_value(self, value): - return self.kwargs.get('value') + @classmethod + def alter_value(cls, original_value, **kwargs): + return kwargs.get('value') @register('uuid4') class UUID4Provider(Provider): """Provider to set a random uuid value.""" - def alter_value(self, value): + @classmethod + def alter_value(cls, original_value, **kwargs): return uuid4() diff --git a/pganonymize/utils.py b/pganonymize/utils.py index f0b31e8..b756551 100644 --- a/pganonymize/utils.py +++ b/pganonymize/utils.py @@ -5,7 +5,6 @@ import json import logging import math -import os import re import subprocess import time @@ -13,24 +12,27 @@ import parmap import psycopg2 import psycopg2.extras -import yaml from pgcopy import CopyManager from psycopg2.sql import SQL, Composed, Identifier from tqdm import trange +from pganonymize.config import config from pganonymize.constants import DEFAULT_CHUNK_SIZE, DEFAULT_PRIMARY_KEY from pganonymize.providers import provider_registry +# Needed to work with UUID objects +psycopg2.extras.register_uuid() -def anonymize_tables(connection, definitions, verbose=False, dry_run=False): + +def anonymize_tables(connection, verbose=False, dry_run=False): """ Anonymize a list of tables according to the schema definition. :param connection: A database connection instance. - :param list definitions: A list of table definitions from the YAML schema. :param bool verbose: Display logging information and a progress bar. :param bool dry_run: Script is running in dry-run mode, no commit expected. 
""" + definitions = config.schema.get('tables', []) for definition in definitions: start_time = time.time() table_name = list(definition.keys())[0] @@ -233,26 +235,26 @@ def get_column_values(row, columns): orig_value = nested_get(row, full_column_name) # Skip the current column if there is no value to be altered if orig_value is not None: - provider = provider_registry.get_provider(provider_config['name'])(**provider_config) - value = provider.alter_value(orig_value) + provider_class = provider_registry.get_provider(provider_config['name']) + value = provider_class.alter_value(orig_value, **provider_config) append = column_definition.get('append') if append: value = value + append - format = column_definition.get('format') - if format: - value = format.format(pga_value=value, **row) + _format = column_definition.get('format') + if _format: + value = _format.format(pga_value=value, **row) nested_set(row, full_column_name, value) column_dict[column_name] = nested_get(row, column_name) return column_dict -def truncate_tables(connection, tables): +def truncate_tables(connection): """ Truncate a list of tables. :param connection: A database connection instance - :param list[str] tables: A list of table names """ + tables = config.schema.get('truncate', []) if not tables: return cursor = connection.cursor() @@ -356,33 +358,3 @@ def nested_set(dic, path, value, delimiter='.'): for key in keys[:-1]: dic = dic.get(key, {}) dic[keys[-1]] = value - - -def load_config(schema): - # Original code from here https://gist.github.com/mkaranasou/ba83e25c835a8f7629e34dd7ede01931 - tag = '!ENV' - pattern = re.compile(r'.*?\${(\w+)}.*?') - custom_loader = yaml.FullLoader - custom_loader.add_implicit_resolver(tag, pattern, None) - - def constructor_env_variables(loader, node): - """ - Extract the environment variable from the node's value. 
- - :param yaml.Loader loader: The yaml loader - :param node: The current node in the yaml - :return: The parsed string that contains the value of the environment variable - """ - value = loader.construct_scalar(node) - match = pattern.findall(value) # to find all env variables in line - if match: - full_value = value - for g in match: - full_value = full_value.replace( - '${{{g}}}'.format(g=g), os.environ.get(g, g) - ) - return full_value - return value - - custom_loader.add_constructor(tag, constructor_env_variables) - return yaml.load(open(schema), Loader=custom_loader) diff --git a/pganonymize/version.py b/pganonymize/version.py index d452437..a8cad44 100644 --- a/pganonymize/version.py +++ b/pganonymize/version.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -__version__ = '0.9.0' +__version__ = '0.10.0' diff --git a/setup.py b/setup.py index aaa98d5..b83774e 100755 --- a/setup.py +++ b/setup.py @@ -76,7 +76,7 @@ def run(self): url='https://github.com/rheinwerk-verlag/pganonymize', license='MIT license', classifiers=[ - 'Development Status :: 3 - Alpha', + 'Development Status :: 4 - Beta', 'Environment :: Console', 'Intended Audience :: Developers', 'License :: OSI Approved :: MIT License', diff --git a/tests/conftest.py b/tests/conftest.py index 40a96af..958038a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1 +1,27 @@ # -*- coding: utf-8 -*- +import pytest +from mock.mock import patch + +from pganonymize.config import config +from pganonymize.providers import faker_initializer + + +@pytest.fixture +def valid_config(): + # Patch the config instance with a valid schema + with patch.multiple('pganonymize.config.config', schema_file='./tests/schemes/valid_schema.yml', _schema=None): + yield config + + +@pytest.fixture +def mocked_faker_initializer(): + # Patch the faker_initializer instance with a Faker mock + with patch('pganonymize.providers.faker_initializer._faker'): + yield faker_initializer + + +@pytest.fixture +def 
faker_initializer_with_localization(mocked_faker_initializer): + # Patch the faker_initializer instance with localization options + with patch.object(mocked_faker_initializer, '_options', {'locales': ('de_DE', 'en_US'), 'default_locale': 'en_US'}): + yield mocked_faker_initializer diff --git a/tests/test_cli.py b/tests/test_cli.py index 02d821a..616b6b5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -9,7 +9,8 @@ from pganonymize.cli import get_arg_parser, main -class TestCli: +class TestCli(object): + @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) @patch('pganonymize.utils.psycopg2.connect') @patch('pganonymize.utils.subprocess') diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..2b6485d --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,61 @@ +import os + +import pytest +from mock import patch + +from pganonymize.config import load_schema + + +@pytest.mark.parametrize('file, envs, expected', [ + [ + './tests/schemes/valid_schema.yml', + {}, + { + 'tables': [ + { + 'auth_user': { + 'primary_key': 'id', + 'chunk_size': 5000, + 'fields': [ + {'first_name': {'provider': {'name': 'fake.first_name'}}}, + {'last_name': {'provider': {'name': 'set', 'value': 'Bar'}}}, + {'email': {'provider': {'name': 'md5'}, 'append': '@localhost'}}, + ], + 'excludes': [ + {'email': ['\\S[^@]*@example\\.com']}, + ] + } + } + ], + 'truncate': ['django_session'] + } + ], + [ + './tests/schemes/schema_with_env_variables.yml', + { + 'TEST_CHUNK_SIZE': '123', + 'TEST_PRIMARY_KEY': 'foo-bar', + 'PRESENT_WORLD_NAME': 'beautiful world', + 'COMPANY_ID': '42', + 'USER_TO_BE_SEARCHED': 'i wanna be forgotten', + }, + { + 'primary_key': 'foo-bar', + 'primary_key2': 'foo-bar', + 'chunk_size': '123', + 'concat_missing': 'Hello, MISSING_ENV_VAL', + 'concat_missing2': 'Hello, ${MISSING_ENV_VAL}', + 'concat_present': 'Hello, beautiful world', + 'concat_present2': 'beautiful world', + 'concat_present3': 'Hello, beautiful world', + 
'search': 'id = 42', + 'search2': "username = 'i wanna be forgotten'", + 'corrupted': "username = '${CORRUPTED", + 'corrupted2': '', + 'corrupted3': '$', + } + ] +]) +def test_load_schema(file, envs, expected): + with patch.dict(os.environ, envs): + assert load_schema(file) == expected diff --git a/tests/test_providers.py b/tests/test_providers.py index 354a53d..45233e2 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -7,6 +7,7 @@ from mock import MagicMock, Mock, call, patch from pganonymize import exceptions, providers +from pganonymize.exceptions import InvalidProviderArgument def test_register(): @@ -29,7 +30,7 @@ def alter_value(self, value): assert 'bar' in registry._registry -class TestProviderRegistry: +class TestProviderRegistry(object): def test_constructor(self): registry = providers.ProviderRegistry() @@ -91,7 +92,7 @@ def test_providers(self): pass -class TestProvider: +class TestProvider(object): def test_alter_value(self): provider = providers.Provider() @@ -99,59 +100,74 @@ def test_alter_value(self): provider.alter_value('Foo') -class TestChoiceProvider: +class TestChoiceProvider(object): def test_alter_value(self): choices = ['Foo', 'Bar', 'Baz'] - provider = providers.ChoiceProvider(values=choices) for choice in choices: - assert provider.alter_value(choice) in choices + assert providers.ChoiceProvider.alter_value(choice, values=choices) in choices -class TestClearProvider: +class TestClearProvider(object): def test_alter_value(self): provider = providers.ClearProvider() assert provider.alter_value('Foo') is None -class TestFakeProvider: +@pytest.mark.usefixtures('valid_config') +class TestFakeProvider(object): @pytest.mark.parametrize('name, function_name', [ ('fake.first_name', 'first_name'), ('fake.unique.first_name', 'unique.first_name'), ]) - @patch('pganonymize.providers.fake_data') - def test_alter_value(self, mock_fake_data, name, function_name): - provider = providers.FakeProvider(name=name) - 
provider.alter_value('Foo') - assert operator.attrgetter(function_name)(mock_fake_data).call_count == 1 + @patch('pganonymize.providers.faker_initializer._faker') + def test_alter_value(self, mock_faker, name, function_name): + providers.FakeProvider.alter_value('Foo', name=name) + assert operator.attrgetter(function_name)(mock_faker).call_count == 1 @pytest.mark.parametrize('name', ['fake.foo_name']) def test_invalid_names(self, name): - provider = providers.FakeProvider(name=name) with pytest.raises(exceptions.InvalidProviderArgument): - provider.alter_value('Foo') + providers.FakeProvider.alter_value('Foo', name=name) + + @patch('pganonymize.providers.faker_initializer._faker') + def test_alter_value_with_kwargs(self, mock_faker): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', kwargs={'minimum_age': 18}) + assert mock_faker.date_of_birth.call_args == call(minimum_age=18) + + @patch('pganonymize.providers.faker_initializer._faker') + def test_alter_value_with_locale(self, mock_faker): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', locale='de_DE') + assert mock_faker['de_DE'].date_of_birth.call_count == 1 + + def test_alter_value_with_unkown_locale(self): + with pytest.raises(InvalidProviderArgument): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', locale='de_DE') - @patch('pganonymize.providers.fake_data') - def test_alter_value_with_kwargs(self, mock_fake_data): - provider = providers.FakeProvider(name='fake.date_of_birth', kwargs={'minimum_age': 18}) - provider.alter_value('Foo') - assert mock_fake_data.date_of_birth.call_args == call(minimum_age=18) + def test_alter_value_use_default_locale(self, faker_initializer_with_localization): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth') + faker = faker_initializer_with_localization._faker + assert faker[faker_initializer_with_localization.default_locale].date_of_birth.call_count == 1 + def 
test_alter_value_ignore_default_locale(self, faker_initializer_with_localization): + providers.FakeProvider.alter_value('Foo', name='fake.date_of_birth', locale=None) + faker = faker_initializer_with_localization._faker + assert faker.date_of_birth.call_count == 1 -class TestMaskProvider: + +class TestMaskProvider(object): @pytest.mark.parametrize('value, sign, expected', [ ('Foo', None, 'XXX'), ('Baaaar', '?', '??????'), ]) def test_alter_value(self, value, sign, expected): - provider = providers.MaskProvider(sign=sign) - assert provider.alter_value(value) == expected + assert providers.MaskProvider.alter_value(value, sign=sign) == expected -class TestPartialMaskProvider: +class TestPartialMaskProvider(object): @pytest.mark.parametrize('value, sign, unmasked_left, unmasked_right, expected', [ ('Foo', None, 1, 1, 'FXo'), @@ -159,15 +175,11 @@ class TestPartialMaskProvider: ('Baaaar', '?', 2, 1, 'Ba???r'), ]) def test_alter_value(self, value, sign, unmasked_left, unmasked_right, expected): - provider = providers.PartialMaskProvider( - sign=sign, - unmasked_left=unmasked_left, - unmasked_right=unmasked_right - ) - assert provider.alter_value(value) == expected + assert providers.PartialMaskProvider.alter_value(value, sign=sign, unmasked_left=unmasked_left, + unmasked_right=unmasked_right) == expected -class TestMD5Provider: +class TestMD5Provider(object): def test_alter_value(self): provider = providers.MD5Provider() @@ -176,33 +188,26 @@ def test_alter_value(self): assert len(value) == 32 def test_as_number(self): - provider = providers.MD5Provider(as_number=True) - value = provider.alter_value('foo') + value = providers.MD5Provider.alter_value('foo', as_number=True) assert isinstance(value, six.integer_types) assert value == 985560 - - provider = providers.MD5Provider(as_number=True, as_number_length=8) - value = provider.alter_value('foobarbazadasd') + value = providers.MD5Provider.alter_value('foobarbazadasd', as_number=True, as_number_length=8) assert 
isinstance(value, six.integer_types) assert value == 45684001 -class TestSetProvider: +class TestSetProvider(object): @pytest.mark.parametrize('kwargs, expected', [ ({'value': None}, None), ({'value': 'Bar'}, 'Bar') ]) def test_alter_value(self, kwargs, expected): - provider = providers.SetProvider(**kwargs) - assert provider.alter_value('Foo') == expected + assert providers.SetProvider.alter_value('Foo', **kwargs) == expected -class TestUUID4Provider: - @pytest.mark.parametrize('kwargs, expected', [ - ({'value': None}, None), - ({'value': 'Bar'}, 'Bar') - ]) - def test_alter_value(self, kwargs, expected): - provider = providers.UUID4Provider(**kwargs) - assert type(provider.alter_value('Foo')) == uuid.UUID +class TestUUID4Provider(object): + + @pytest.mark.parametrize('value, expected', [(None, uuid.UUID), ('Foo', uuid.UUID)]) + def test_alter_value(self, value, expected): + assert type(providers.UUID4Provider.alter_value(value)) == expected diff --git a/tests/test_utils.py b/tests/test_utils.py index 9038ba9..ac5cdc5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,4 @@ import math -import os from collections import OrderedDict, namedtuple import pytest @@ -7,11 +6,13 @@ from tests.utils import quote_ident -from pganonymize.utils import (anonymize_tables, build_and_then_import_data, create_database_dump, - get_column_values, get_connection, import_data, load_config, truncate_tables) +from pganonymize.utils import ( + anonymize_tables, build_and_then_import_data, create_database_dump, get_column_values, get_connection, import_data, + truncate_tables, +) -class TestGetConnection: +class TestGetConnection(object): @patch('pganonymize.utils.psycopg2.connect') def test(self, mock_connect): @@ -26,7 +27,8 @@ def test(self, mock_connect): mock_connect.assert_called_once_with(**connection_data) -class TestTruncateTables: +class TestTruncateTables(object): + @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) 
@pytest.mark.parametrize('tables, expected', [ [('table_a', 'table_b', 'CAPS_TABLe'), 'TRUNCATE TABLE "table_a", "table_b", "CAPS_TABLe"'], @@ -36,7 +38,8 @@ def test(self, quote_ident, tables, expected): mock_cursor = Mock() connection = Mock() connection.cursor.return_value = mock_cursor - truncate_tables(connection, tables) + with patch.multiple('pganonymize.config.config', schema_file=None, _schema={'truncate': tables}): + truncate_tables(connection) if tables: connection.cursor.assert_called_once() assert mock_cursor.execute.call_args_list == [call(expected)] @@ -47,7 +50,7 @@ def test(self, quote_ident, tables, expected): mock_cursor.close.assert_not_called() -class TestImportData: +class TestImportData(object): @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) @patch('pgcopy.copy.util') @patch('pgcopy.copy.inspect') @@ -84,8 +87,9 @@ def test(self, inspect, util, quote_ident, tmp_table, cols, data): assert mock_cursor.copy_expert.call_args_list == expected @patch('pganonymize.utils.CopyManager') + @patch('pganonymize.utils.config') @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) - def test_anonymize_tables(self, quote_ident, copy_manager): + def test_anonymize_tables(self, quote_ident, mock_config, copy_manager): mock_cursor = Mock() mock_cursor.fetchone.return_value = [2] mock_cursor.fetchmany.side_effect = [ @@ -102,8 +106,8 @@ def test_anonymize_tables(self, quote_ident, copy_manager): connection = Mock() connection.cursor.return_value = mock_cursor - definitions = [] - anonymize_tables(connection, definitions, verbose=True) + mock_config.schema = {'tables': []} + anonymize_tables(connection, verbose=True) assert connection.cursor.call_count == 0 assert mock_cursor.close.call_count == 0 @@ -148,8 +152,8 @@ def test_anonymize_tables(self, quote_ident, copy_manager): } } ] - - anonymize_tables(connection, definitions, verbose=True) + mock_config.schema = {'tables': definitions} + anonymize_tables(connection, verbose=True) 
assert connection.cursor.call_count == mock_cursor.close.call_count assert copy_manager.call_args_list == [call(connection, 'tmp_auth_user', ['id', 'first_name', 'json_column'])] assert cmm.copy.call_count == 1 @@ -157,7 +161,7 @@ def test_anonymize_tables(self, quote_ident, copy_manager): ['dummy nameappend-me', b'{"field2": "dummy json field2"}']])] -class TestBuildAndThenImport: +class TestBuildAndThenImport(object): @patch('psycopg2.extensions.quote_ident', side_effect=quote_ident) @patch('pganonymize.utils.CopyManager') @pytest.mark.parametrize('table, primary_key, columns, total_count, chunk_size', [ @@ -228,68 +232,10 @@ def test_column_format(self, copy_manager): assert result == expected -class TestCreateDatabaseDump: +class TestCreateDatabaseDump(object): @patch('pganonymize.utils.subprocess.call') def test(self, mock_call): create_database_dump('/tmp/dump.gz', {'dbname': 'database', 'user': 'foo', 'host': 'localhost', 'port': 5432}) mock_call.assert_called_once_with('pg_dump -Fc -Z 9 -d database -U foo -h localhost -p 5432 -f /tmp/dump.gz', shell=True) - - -class TestConfigLoader: - - @pytest.mark.parametrize('file, envs, expected', [ - [ - './tests/schemes/valid_schema.yml', - {}, - { - 'tables': [ - { - 'auth_user': { - 'primary_key': 'id', - 'chunk_size': 5000, - 'fields': [ - {'first_name': {'provider': {'name': 'fake.first_name'}}}, - {'last_name': {'provider': {'name': 'set', 'value': 'Bar'}}}, - {'email': {'provider': {'name': 'md5'}, 'append': '@localhost'}}, - ], - 'excludes': [ - {'email': ['\\S[^@]*@example\\.com']}, - ] - } - } - ], - 'truncate': ['django_session'] - } - ], - [ - './tests/schemes/schema_with_env_variables.yml', - { - 'TEST_CHUNK_SIZE': '123', - 'TEST_PRIMARY_KEY': 'foo-bar', - 'PRESENT_WORLD_NAME': 'beautiful world', - 'COMPANY_ID': '42', - 'USER_TO_BE_SEARCHED': 'i wanna be forgotten', - }, - { - 'primary_key': 'foo-bar', - 'primary_key2': 'foo-bar', - 'chunk_size': '123', - 'concat_missing': 'Hello, MISSING_ENV_VAL', - 
'concat_missing2': 'Hello, ${MISSING_ENV_VAL}', - 'concat_present': 'Hello, beautiful world', - 'concat_present2': 'beautiful world', - 'concat_present3': 'Hello, beautiful world', - 'search': 'id = 42', - 'search2': "username = 'i wanna be forgotten'", - 'corrupted': "username = '${CORRUPTED", - 'corrupted2': '', - 'corrupted3': '$', - } - ] - ]) - def test(self, file, envs, expected): - with patch.dict(os.environ, envs): - print(load_config(file)) - assert load_config(file) == expected