thread · judahrand · Feb 23, 2022 · Feb 23, 2022 · Feb 24, 2022 · Feb 24, 2022
diff --git a/pipelinewise/cli/config.py b/pipelinewise/cli/config.py
@@ -325,6 +325,7 @@ def generate_selection(cls, tap: Dict) -> List[Dict]:
                             # Add replication_key only if replication_method is INCREMENTAL
                             'replication_key': table.get('replication_key')
                             if replication_method == 'INCREMENTAL' else None,
+                            'partition_by': table.get('partition_by'),
                         }
                     )
                 )

diff --git a/pipelinewise/cli/pipelinewise.py b/pipelinewise/cli/pipelinewise.py
@@ -728,6 +728,12 @@ def make_default_selection(self, schema, selection_file):
                         ]['metadata']['replication-key'] = tap_stream_sel[
                             'replication_key'
                         ]
+                    if 'partition_by' in tap_stream_sel:
+                        schema['streams'][stream_idx]['metadata'][
+                            stream_table_mdata_idx
+                        ]['metadata']['partition-by'] = tap_stream_sel[
+                            'partition_by'
+                        ]
                 else:
                     self.logger.debug(
                         'Mark %s tap_stream_id as not selected', tap_stream_id

diff --git a/pipelinewise/cli/schemas/tap.json b/pipelinewise/cli/schemas/tap.json
@@ -85,6 +85,9 @@
                 "LOG_BASED"
               ]
             },
+            "partition_by": {
+              "$ref": "#/definitions/partition_by"
+            },
             "transformations": {
               "type": "array",
               "items": {
@@ -113,6 +116,9 @@
             "replication_key": {
               "type": "string"
             },
+            "partition_by": {
+              "$ref": "#/definitions/partition_by"
+            },
             "transformations": {
               "type": "array",
               "items": {
@@ -323,6 +329,65 @@
           }
         }
       ]
+    },
+    "partition_by": {
+      "type": "object",
+      "description": "Partitioning of the target table, used by the BigQuery fastsync target",
+      "required": ["field", "data_type"],
+      "properties": {
+        "field": {
+          "description": "Column to partition the table by",
+          "type": "string"
+        },
+        "data_type": {
+          "description": "Type of the partitioning column",
+          "enum": ["timestamp", "date", "datetime", "int64"]
+        },
+        "granularity": {
+          "description": "Partition size for date, datetime and timestamp columns",
+          "enum": ["hour", "day", "month", "year"]
+        },
+        "range": {
+          "description": "Integer range partitioning, required when data_type is int64",
+          "type": "object",
+          "required": ["start", "end", "interval"],
+          "properties": {
+            "start": {
+              "description": "First argument passed to GENERATE_ARRAY",
+              "type": "integer"
+            },
+            "end": {
+              "description": "Second argument passed to GENERATE_ARRAY",
+              "type": "integer"
+            },
+            "interval": {
+              "description": "Third argument passed to GENERATE_ARRAY",
+              "type": "integer"
+            }
+          }
+        }
+      },
+      "anyOf": [
+        {
+          "properties": {
+            "data_type": { "const": "int64" }
+          },
+          "required": ["range"]
+        },
+        {
+          "properties": {
+            "data_type": { "const": "date" },
+            "granularity": {
+              "enum": ["day", "month", "year"]
+            }
+          }
+        },
+        {
+          "properties": {
+            "data_type": { "enum": ["datetime", "timestamp"] }
+          }
+        }
+      ]
     }
   },
   "type": "object",

diff --git a/pipelinewise/fastsync/commons/target_bigquery.py b/pipelinewise/fastsync/commons/target_bigquery.py
@@ -31,6 +31,32 @@ def safe_name(name, quotes=True):
     return removed_bad_chars
 
 
+def get_partition_by_clause(partition_by: Dict) -> str:
+    """Build the BigQuery PARTITION BY clause (with a leading space).
+
+    ``partition_by`` needs ``field`` and ``data_type`` (plus ``range`` for int64);
+    ``granularity`` defaults to day. Raises ValueError on an unknown data_type.
+    """
+    field = partition_by['field'].lower()
+    data_type = partition_by['data_type']
+    granularity = partition_by.get('granularity', 'day')
+    if data_type == 'int64':
+        bucket = partition_by['range']
+        start, end, step = bucket['start'], bucket['end'], bucket['interval']
+        sub_clause = f'RANGE_BUCKET({field}, GENERATE_ARRAY({start}, {end}, {step}))'
+    elif data_type == 'date':
+        # Daily partitions wrap the column in DATE(); coarser ones truncate it.
+        sub_clause = f'DATE({field})'
+        if granularity != 'day':
+            sub_clause = f'DATE_TRUNC({field}, {granularity})'
+    elif data_type in ('datetime', 'timestamp'):
+        sub_clause = f'{data_type.upper()}_TRUNC({field}, {granularity})'
+    else:
+        raise ValueError(f'Unsupported partition_by data_type: {data_type!r}')
+    # The caller already closed the column list, so no ")" is appended here.
+    return f' PARTITION BY {sub_clause}'
+
+
 # pylint: disable=missing-function-docstring,no-self-use,too-many-arguments
 class FastSyncTargetBigquery:
     """
@@ -194,6 +220,7 @@ def create_table(
         primary_key: Optional[List[str]],
         is_temporary: bool = False,
         sort_columns=False,
+        partition_by: Optional[Dict] = None,
     ):
 
         table_dict = utils.tablename_to_dict(table_name)
@@ -232,9 +259,11 @@ def create_table(
             f'CREATE OR REPLACE TABLE {target_schema}.{target_table} ('
             f'{",".join(columns)})'
         )
+        if partition_by:
+            sql = f'{sql} {get_partition_by_clause(partition_by)}'
         if primary_key:
             primary_key = [c.lower() for c in primary_key]
-            sql = sql + f' CLUSTER BY {",".join(primary_key)}'
+            sql = f'{sql} CLUSTER BY {",".join(primary_key)}'
 
         self.query(sql)
 

diff --git a/pipelinewise/fastsync/commons/utils.py b/pipelinewise/fastsync/commons/utils.py
@@ -5,7 +5,7 @@
 import logging
 import datetime
 
-from typing import Dict
+from typing import Dict, Optional
 from pipelinewise.cli.utils import generate_random_string
 
 LOGGER = logging.getLogger(__name__)
@@ -107,17 +107,13 @@ def get_tables_from_properties(properties: Dict) -> set:
     return tables
 
 
-def get_bookmark_for_table(table, properties, db_engine, dbname=None):
-    """Get actual bookmark for a specific table used for LOG_BASED or INCREMENTAL
-    replications
-    """
-    bookmark = {}
-
+def get_metadata_for_table(
+    table: str, properties: Dict, dbname: Optional[str] = None
+) -> Dict:
     # Find table from properties and get bookmark based on replication method
     for stream in properties.get('streams', []):
         metadata = stream.get('metadata', [])
         table_name = stream.get('table_name', stream['stream'])
-
         # Get table specific metadata i.e. replication method, replication key, etc.
         table_meta = next(
             (
@@ -129,8 +125,6 @@ def get_bookmark_for_table(table, properties, db_engine, dbname=None):
         ).get('metadata')
         db_name = table_meta.get('database-name')
         schema_name = table_meta.get('schema-name')
-        replication_method = table_meta.get('replication-method')
-        replication_key = table_meta.get('replication-key')
 
         fully_qualified_table_name = (
             '{}.{}'.format(schema_name or db_name, table_name)
@@ -141,18 +135,31 @@ def get_bookmark_for_table(table, properties, db_engine, dbname=None):
         if (
             dbname is None or db_name == dbname
         ) and fully_qualified_table_name == table:
-            # Log based replication: get mysql binlog position
-            if replication_method == 'LOG_BASED':
-                bookmark = db_engine.fetch_current_log_pos()
+            return table_meta
+    return {}
 
-            # Key based incremental replication: Get max replication key from source
-            elif replication_method == 'INCREMENTAL':
-                bookmark = db_engine.fetch_current_incremental_key_pos(
-                    fully_qualified_table_name, replication_key
-                )
 
-            break
+def get_bookmark_for_table(table, properties, db_engine, dbname=None):
+    """Return the bookmark of `table`: log position for LOG_BASED, incremental key
+    position for INCREMENTAL, else {}; a non-empty 'partition-by' spec is added.
+    NOTE(review): callers persist this dict as state - confirm that is intended."""
+    bookmark = {}
 
+    table_meta = get_metadata_for_table(table, properties, dbname=dbname)
+    replication_method = table_meta.get('replication-method')
+    replication_key = table_meta.get('replication-key')
+
+    # Log based replication: get the current log position from the source
+    if replication_method == 'LOG_BASED':
+        bookmark = db_engine.fetch_current_log_pos()
+
+    # Key based incremental replication: Get max replication key from source
+    elif replication_method == 'INCREMENTAL':
+        bookmark = db_engine.fetch_current_incremental_key_pos(
+            table, replication_key
+        )
+    if table_meta.get('partition-by'):  # key may exist with a null value
+        bookmark['partition_by'] = table_meta['partition-by']
     return bookmark
 
 

diff --git a/pipelinewise/fastsync/mongodb_to_bigquery.py b/pipelinewise/fastsync/mongodb_to_bigquery.py
@@ -50,9 +50,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]:
     bigquery = FastSyncTargetBigquery(args.target, args.transform)
     tap_id = args.target.get('tap_id')
     archive_load_files = args.target.get('archive_load_files', False)
+    dbname = args.tap.get('dbname')
 
     try:
-        dbname = args.tap.get('dbname')
         filename = 'pipelinewise_fastsync_{}_{}_{}.csv'.format(
             dbname, table, time.strftime('%Y%m%d-%H%M%S')
         )
@@ -81,7 +81,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]:
 
         # Creating temp table in Bigquery
         bigquery.create_schema(target_schema)
-        bigquery.create_table(target_schema, table, bigquery_columns, primary_key, is_temporary=True)
+        bigquery.create_table(
+            target_schema,
+            table,
+            bigquery_columns,
+            primary_key,
+            is_temporary=True,
+            partition_by=bookmark.get('partition_by'),
+        )
 
         # Load into Bigquery table
         bigquery.copy_to_table(
@@ -99,7 +106,13 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]:
         bigquery.obfuscate_columns(target_schema, table)
 
         # Create target table and swap with the temp table in Bigquery
-        bigquery.create_table(target_schema, table, bigquery_columns, primary_key)
+        bigquery.create_table(
+            target_schema,
+            table,
+            bigquery_columns,
+            primary_key,
+            partition_by=bookmark.get('partition_by'),
+        )
         bigquery.swap_tables(target_schema, table)
 
         # Save bookmark to singer state file

diff --git a/pipelinewise/fastsync/mysql_to_bigquery.py b/pipelinewise/fastsync/mysql_to_bigquery.py
@@ -111,7 +111,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]:
 
         # Creating temp table in Bigquery
         bigquery.create_schema(target_schema)
-        bigquery.create_table(target_schema, table, bigquery_columns, primary_key, is_temporary=True)
+        bigquery.create_table(
+            target_schema,
+            table,
+            bigquery_columns,
+            primary_key,
+            is_temporary=True,
+            partition_by=bookmark.get('partition_by'),
+        )
 
         # Load into Bigquery table
         bigquery.copy_to_table(
@@ -130,7 +137,13 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]:
         bigquery.obfuscate_columns(target_schema, table)
 
         # Create target table and swap with the temp table in Bigquery
-        bigquery.create_table(target_schema, table, bigquery_columns, primary_key)
+        bigquery.create_table(
+            target_schema,
+            table,
+            bigquery_columns,
+            primary_key,
+            partition_by=bookmark.get('partition_by'),
+        )
         bigquery.swap_tables(target_schema, table)
 
         # Save bookmark to singer state file

diff --git a/pipelinewise/fastsync/postgres_to_bigquery.py b/pipelinewise/fastsync/postgres_to_bigquery.py
@@ -70,9 +70,9 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]:
     bigquery = FastSyncTargetBigquery(args.target, args.transform)
     tap_id = args.target.get('tap_id')
     archive_load_files = args.target.get('archive_load_files', False)
+    dbname = args.tap.get('dbname')
 
     try:
-        dbname = args.tap.get('dbname')
         filename = 'pipelinewise_fastsync_{}_{}_{}.csv'.format(
             dbname, table, time.strftime('%Y%m%d-%H%M%S')
         )
@@ -113,7 +113,14 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]:
 
         # Creating temp table in Bigquery
         bigquery.create_schema(target_schema)
-        bigquery.create_table(target_schema, table, bigquery_columns, primary_key, is_temporary=True)
+        bigquery.create_table(
+            target_schema,
+            table,
+            bigquery_columns,
+            primary_key,
+            is_temporary=True,
+            partition_by=bookmark.get('partition_by'),
+        )
 
         # Load into Bigquery table
         bigquery.copy_to_table(
@@ -132,7 +139,13 @@ def sync_table(table: str, args: Namespace) -> Union[bool, str]:
         bigquery.obfuscate_columns(target_schema, table)
 
         # Create target table and swap with the temp table in Bigquery
-        bigquery.create_table(target_schema, table, bigquery_columns, primary_key)
+        bigquery.create_table(
+            target_schema,
+            table,
+            bigquery_columns,
+            primary_key,
+            partition_by=bookmark.get('partition_by'),
+        )
         bigquery.swap_tables(target_schema, table)
 
         # Save bookmark to singer state file

diff --git a/pipelinewise/fastsync/s3_csv_to_bigquery.py b/pipelinewise/fastsync/s3_csv_to_bigquery.py
@@ -66,6 +66,9 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]:
         bigquery_columns = bigquery_types.get('columns', [])
         primary_key = bigquery_types.get('primary_key', [])
 
+        # Get bookmark
+        bookmark = utils.get_bookmark_for_table(table_name, args.properties, s3_csv)
+
         # Creating temp table in Bigquery
         bigquery.create_schema(target_schema)
         bigquery.create_table(
@@ -75,6 +78,7 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]:
             primary_key,
             is_temporary=True,
             sort_columns=True,
+            partition_by=bookmark.get('partition_by'),
         )
 
         # Load into Bigquery table
@@ -93,12 +97,15 @@ def sync_table(table_name: str, args: Namespace) -> Union[bool, str]:
         bigquery.obfuscate_columns(target_schema, table_name)
 
         # Create target table and swap with the temp table in Bigquery
-        bigquery.create_table(target_schema, table_name, bigquery_columns, primary_key)
+        bigquery.create_table(
+            target_schema,
+            table_name,
+            bigquery_columns,
+            primary_key,
+            partition_by=bookmark.get('partition_by'),
+        )
         bigquery.swap_tables(target_schema, table_name)
 
-        # Get bookmark
-        bookmark = utils.get_bookmark_for_table(table_name, args.properties, s3_csv)
-
         # Save bookmark to singer state file
         # Lock to ensure that only one process writes the same state file at a time
         LOCK.acquire()