fix: snake_case conversion and tabs/comma options

- fix snake case conversion function (it previously worked only with strict camelCase input)
- add an option to use tabs instead of 4 spaces for SQL indentation (was always tabs before)
- add an option to control whether commas are placed at the start or end of each line in the SQL column list (was always leading commas before)
H-Max · Jul 17, 2024 · 2ed5943 · 2ed5943
1 parent c81f4ba
commit 2ed5943
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 21 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,7 +11,7 @@ __pycache__/
 _local
 
 # Output directory of the package itself
-/target
+target/
 
 # Distribution / packaging
 build/

diff --git a/README.md b/README.md
@@ -63,6 +63,8 @@ bq2dbt myproject.mydataset
 | `--suffix`            | Suffix to add to column names (default: None)                                           |
 | `--output`            | Destination folder for scripts. (default: target/bq2dbt)                                |
 | `--empty_description` | Add empty description property to YAML file if field description is empty (placeholder) |
+| `--tabs`              | Use tabs instead of 4 spaces in SQL file indentation                                    |
+| `--leading_comma`     | Put comma at the start of line in SQL file column list instead of end of line           |
 
 # TODO
 

diff --git a/bq2dbt/bq2dbt.py b/bq2dbt/bq2dbt.py
@@ -14,6 +14,7 @@
 import yaml
 
 from google.cloud import bigquery
+from unidecode import unidecode
 
 logging.basicConfig(
     level=logging.INFO,
@@ -22,20 +23,29 @@
 
 logger = logging.getLogger(__name__)
 case_convert_regex = re.compile(r'(?<!^)(?=[A-Z])')
-SQL_INDENTATION = "\t"
-
 
 def convert_to_snake_case(input_string: str) -> str:
-    """
-    Converts a string from CamelCase to snake_case.
-
-    Args:
-        input_string (str): The CamelCase string to be converted.
-
-    Returns:
-        str: The string converted to snake_case.
-    """
-    return case_convert_regex.sub('_', input_string).lower()
+    """Convert a string to snake_case: transliterate accents, split on separators and camelCase."""
+    # Transliterate accents FIRST: the ASCII-only regex below would otherwise turn them into "_"
+    output_string = unidecode(input_string)
+    # Convert every remaining character that is not a letter or a digit to "_"
+    output_string = re.sub(r'[^a-zA-Z0-9]', '_', output_string)
+    # Split on "_" to get the list of words, dropping empty strings
+    words = [word for word in output_string.split('_') if word]
+
+    output_array = []
+    for word in words:
+        new_word = ''
+        previous_char: str = ''
+        for char in word:
+            # Insert "_" at each lowercase -> uppercase boundary (camelCase split)
+            if char.isupper() and previous_char and previous_char.islower():
+                new_word += '_'
+            new_word += char.lower()
+            previous_char = char
+        output_array.append(new_word)
+
+    return '_'.join(output_array)
 
 
 def parse_command_line():
@@ -47,6 +57,8 @@ def parse_command_line():
                         help="Include empty description property in YAML file")
     parser.add_argument("--prefix", help="Prefix to add to columns names", default=None)
     parser.add_argument("--suffix", help="Suffix to add to column names", default=None)
+    parser.add_argument("--leading_comma", help="Add comma at the start of line in SQL columns output", action="store_true")
+    parser.add_argument("--tabs", help="Indent SQL with tabs instead of spaces", action="store_true")
     parser.add_argument("--output", help="Output folder of scripts. By default 'target/bq2dbt'",
                         default='target/bq2dbt')
     return parser.parse_args()
@@ -83,6 +95,9 @@ def bq2dbt():
     prefix = args.prefix
     suffix = args.suffix
     empty_description = args.empty_description
+    leading_comma = args.leading_comma
+
+    sql_indentation = '\t' if args.tabs else '    '
 
     output_folder = args.output
 
@@ -194,11 +209,17 @@ def bq2dbt():
         logger.info("Require partition filter : %s", table_require_partition_filter)
         if table_time_partitioning:
             yaml_data['models'][0]['config']['partition_by'] = {
-                "field": table_time_partitioning.field,
-                "granularity": table_time_partitioning.type_,
-                "data_type": field_types[table_time_partitioning.field]
+                "granularity": table_time_partitioning.type_
             }
 
+            if table_time_partitioning.field:
+                yaml_data['models'][0]['config']['partition_by'].update(
+                    {
+                        "field": table_time_partitioning.field,
+                        "data_type": field_types[table_time_partitioning.field]
+                    }
+                )
+
             if table_require_partition_filter:
                 yaml_data['models'][0]['config']['require_partition_filter'] = True
 
@@ -214,9 +235,10 @@ def bq2dbt():
         yaml_output = yaml.dump(yaml_data, default_flow_style=False, sort_keys=False)
 
         # Generate the SQL output
-        sql_columns_statement = f"\n{SQL_INDENTATION}, ".join(sql_columns)
-        sql_from_statement = f"\n{SQL_INDENTATION}`{project_id}.{dataset_id}.{table_id}`"
-        sql_output = (f"SELECT\n{SQL_INDENTATION}{sql_columns_statement}\nFROM{sql_from_statement}"
+        sql_columns_separator = f'\n{sql_indentation}, ' if leading_comma else f',\n{sql_indentation}'
+        sql_columns_statement = sql_columns_separator.join(sql_columns)
+        sql_from_statement = f"\n{sql_indentation}`{project_id}.{dataset_id}.{table_id}`"
+        sql_output = (f"SELECT\n{sql_indentation}{sql_columns_statement}\nFROM{sql_from_statement}"
                       f"  -- Replace this with ref() or source() macro\n")
 
         output_path = f"./{output_folder}/{project_id}/{dataset_id}"
@@ -234,4 +256,4 @@ def bq2dbt():
 
 
 if __name__ == "__main__":
-    bq2dbt()
+    bq2dbt()
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,7 +15,8 @@ requires-python = ">=3.9"
 license = {text = "GNU General Public Licence v3"}
 dependencies = [
     "PyYAML==6.0.1",
-    "google-cloud-bigquery==3.11.4"
+    "google-cloud-bigquery==3.11.4",
+    "unidecode==1.3.8"
 ]
 dynamic = ["version"]
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,7 +11,7 @@ __pycache__/ @@
     _local
     # Output directory of the package itself
-    /target
+    target/
     # Distribution / packaging
     build/
@@ Expand Down @@