Skip to content

Commit

Permalink
Merge branch 'development'
Browse files Browse the repository at this point in the history
  • Loading branch information
hkage committed Dec 20, 2019
2 parents 5cffe06 + 4514612 commit 87b3196
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 8 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

## Development

## 0.2.1 (2019-12-20)

* Added field based, regular expression excludes (to skip data under certain conditions).
Currently only regular expressions are supported and the exclusion affects the whole row,
not just one single column.

## 0.2.0 (2019-12-20)

* Added provider classes
Expand Down
22 changes: 20 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,25 @@ be treated.
name: md5
append: @localhost

For each table you can also specify a list of ``excludes``. Each entry has to be a field name which contains
a list of exclude patterns. If one of these patterns matches, the whole table row won't be anonymized.

**Example:**::

tables:
- auth_user:
primary_key: id
fields:
- first_name:
provider:
name: clear
excludes:
- email:
- "\\S.*@example.com"

This will exclude all rows of the table ``auth_user`` whose ``email`` field matches the
regular expression pattern ``\S.*@example.com`` (the backslash is doubled to escape it inside the double-quoted YAML string).


Providers
---------
Expand Down Expand Up @@ -300,8 +319,7 @@ the ``BROWSER`` make variable, e.g.::

TODOs
-----
* Add tests
* Add exceptions for certain field values
* Add more tests
* Add option to create a database dump
* Add a commandline argument to list all available providers

Expand Down
39 changes: 34 additions & 5 deletions pganonymizer/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import csv
import logging
import re
from cStringIO import StringIO

import psycopg2
Expand All @@ -25,20 +26,22 @@ def anonymize_tables(connection, definitions, verbose=False):
logging.info('Found table definition "%s"', table_name)
table_definition = definition[table_name]
columns = table_definition.get('fields', [])
excludes = table_definition.get('excludes', [])
column_dict = get_column_dict(columns)
primary_key = table_definition.get('primary_key', DEFAULT_PRIMARY_KEY)
total_count = get_table_count(connection, table_name)
data, table_columns = build_data(connection, table_name, columns, total_count, verbose)
data, table_columns = build_data(connection, table_name, columns, excludes, total_count, verbose)
import_data(connection, column_dict, table_name, table_columns, primary_key, data)


def build_data(connection, table, columns, total_count, verbose=False):
def build_data(connection, table, columns, excludes, total_count, verbose=False):
"""
Select all data from a table and build
:param connection: A database connection instance.
:param str table: Name of the table to retrieve the data.
:param list columns:
:param list[dict] excludes: A list of exclude definitions.
:param int total_count: The amount of rows for the current table
:param bool verbose: Display logging information and a progress bar.
:return: A tuple containing the data list and a complete list of all table columns.
Expand All @@ -55,9 +58,11 @@ def build_data(connection, table, columns, total_count, verbose=False):
if not records:
break
for row in records:
row_column_dict = get_column_values(row, columns)
for key, value in row_column_dict.items():
row[key] = value
row_column_dict = {}
if not row_matches_excludes(row, excludes):
row_column_dict = get_column_values(row, columns)
for key, value in row_column_dict.items():
row[key] = value
if verbose:
bar.next()
table_columns = row.keys()
Expand All @@ -70,6 +75,30 @@ def build_data(connection, table, columns, total_count, verbose=False):
return data, table_columns


def row_matches_excludes(row, excludes=None):
    """
    Check whether a row matches a list of field exclusion patterns.

    Patterns are regular expressions, applied case-insensitively and anchored
    at the start of the column value (``re.match`` semantics).

    :param dict row: The data row; values are looked up by column name.
    :param list excludes: A list of field exclusion rules, e.g.:
        [
            {'email': ['\\S.*@example.com', '\\S.*@foobar.com', ]}
        ]
    :return: True if at least one pattern matches its column value, otherwise False.
    :rtype: bool
    """
    for definition in excludes or []:
        # Iterate the mapping directly instead of `definition.keys()[0]`:
        # indexing `keys()` only works on Python 2 and fails on an empty dict.
        for column, patterns in definition.items():
            # A column declared without any patterns is loaded as None from YAML.
            if not patterns:
                continue
            value = row[column]
            # NULL columns cannot match a pattern; `re` would raise TypeError on None.
            if value is None:
                continue
            for exclude in patterns:
                if re.match(exclude, value, re.IGNORECASE):
                    return True
    return False


def copy_from(connection, data, table, columns):
"""
Copy the data from a table to a temporary table.
Expand Down
2 changes: 1 addition & 1 deletion pganonymizer/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# -*- coding: utf-8 -*-

__version__ = '0.2.0'
__version__ = '0.2.1'
3 changes: 3 additions & 0 deletions sample_schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ tables:
provider:
name: md5
append: "@localhost"
excludes:
- email:
- "\\S.*@example.com"

0 comments on commit 87b3196

Please sign in to comment.