Skip to content

Commit

Permalink
fix: snake_case conversion and tabs/comma options
Browse files Browse the repository at this point in the history
- fix snake case conversion function (it worked only with strict camelCase input before)
- added an option to use tabs instead of 4 spaces in SQL indentation (was tabs before)
- added an option to control whether commas are added at the start or end of the line for SQL column lists (was leading comma before)
  • Loading branch information
H-Max committed Jul 17, 2024
1 parent c81f4ba commit 2ed5943
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ __pycache__/
_local

# Output directory of the package itself
/target
target/

# Distribution / packaging
build/
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ bq2dbt myproject.mydataset
| `--suffix` | Suffix to add to column names (default: None) |
| `--output` | Destination folder for scripts. (default: target/bq2dbt) |
| `--empty_description` | Add empty description property to YAML file if field description is empty (placeholder) |
| `--tabs` | Use tabs instead of 4 spaces in SQL file indentation |
| `--leading_comma` | Put the comma at the start of each line in the SQL file column list instead of at the end of the line |

# TODO

Expand Down
60 changes: 41 additions & 19 deletions bq2dbt/bq2dbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import yaml

from google.cloud import bigquery
from unidecode import unidecode

logging.basicConfig(
level=logging.INFO,
Expand All @@ -22,20 +23,29 @@

logger = logging.getLogger(__name__)
case_convert_regex = re.compile(r'(?<!^)(?=[A-Z])')
SQL_INDENTATION = "\t"


def convert_to_snake_case(input_string: str) -> str:
    """
    Convert an arbitrary name to snake_case.

    The input does not need to be strict camelCase: accented characters are
    transliterated to ASCII, any run of characters other than ASCII letters
    and digits is treated as a word separator, and inside each word an
    underscore is inserted wherever a lowercase letter is directly followed
    by an uppercase letter.

    Args:
        input_string (str): The name to convert.

    Returns:
        str: The lowercase, underscore-separated version of the name.
    """
    # Transliterate BEFORE stripping non-alphanumeric characters: otherwise
    # accented letters are already replaced by separators and unidecode has
    # nothing left to convert.
    ascii_string = unidecode(input_string)

    # Every run of characters that is not an ASCII letter or digit is a word
    # boundary; empty pieces (leading/trailing/consecutive separators) are
    # dropped.
    words = [word for word in re.split(r'[^a-zA-Z0-9]+', ascii_string) if word]

    snake_words = []
    for word in words:
        pieces = []
        previous_char = ''
        for char in word:
            # lower -> UPPER transition marks the start of a new sub-word.
            # ''.islower() is False, so the first character never triggers it.
            if char.isupper() and previous_char.islower():
                pieces.append('_')
            pieces.append(char.lower())
            previous_char = char
        snake_words.append(''.join(pieces))

    return '_'.join(snake_words)


def parse_command_line():
Expand All @@ -47,6 +57,8 @@ def parse_command_line():
help="Include empty description property in YAML file")
parser.add_argument("--prefix", help="Prefix to add to columns names", default=None)
parser.add_argument("--suffix", help="Suffix to add to column names", default=None)
parser.add_argument("--leading_comma", help="Add comma at the start if line in SQL columns ouput", action="store_true")
parser.add_argument("--tabs", help="Indent SQL with tabs instead of spaces", action="store_true")
parser.add_argument("--output", help="Output folder of scripts. By default 'target/bq2dbt'",
default='target/bq2dbt')
return parser.parse_args()
Expand Down Expand Up @@ -83,6 +95,9 @@ def bq2dbt():
prefix = args.prefix
suffix = args.suffix
empty_description = args.empty_description
leading_comma = args.leading_comma

sql_indentation = '\t' if args.tabs else ' '

output_folder = args.output

Expand Down Expand Up @@ -194,11 +209,17 @@ def bq2dbt():
logger.info("Require partition filter : %s", table_require_partition_filter)
if table_time_partitioning:
yaml_data['models'][0]['config']['partition_by'] = {
"field": table_time_partitioning.field,
"granularity": table_time_partitioning.type_,
"data_type": field_types[table_time_partitioning.field]
"granularity": table_time_partitioning.type_
}

if table_time_partitioning.field:
yaml_data['models'][0]['config']['partition_by'].update(
{
"field": table_time_partitioning.field,
"data_type": field_types[table_time_partitioning.field]
}
)

if table_require_partition_filter:
yaml_data['models'][0]['config']['require_partition_filter'] = True

Expand All @@ -214,9 +235,10 @@ def bq2dbt():
yaml_output = yaml.dump(yaml_data, default_flow_style=False, sort_keys=False)

# Generate the SQL output
sql_columns_statement = f"\n{SQL_INDENTATION}, ".join(sql_columns)
sql_from_statement = f"\n{SQL_INDENTATION}`{project_id}.{dataset_id}.{table_id}`"
sql_output = (f"SELECT\n{SQL_INDENTATION}{sql_columns_statement}\nFROM{sql_from_statement}"
sql_columns_separator = f'\n{sql_indentation}, ' if leading_comma else f',\n{sql_indentation}'
sql_columns_statement = sql_columns_separator.join(sql_columns)
sql_from_statement = f"\n{sql_indentation}`{project_id}.{dataset_id}.{table_id}`"
sql_output = (f"SELECT\n{sql_indentation}{sql_columns_statement}\nFROM{sql_from_statement}"
f" -- Replace this with ref() or source() macro\n")

output_path = f"./{output_folder}/{project_id}/{dataset_id}"
Expand All @@ -234,4 +256,4 @@ def bq2dbt():


if __name__ == "__main__":
bq2dbt()
bq2dbt()
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ requires-python = ">=3.9"
license = {text = "GNU General Public Licence v3"}
dependencies = [
"PyYAML==6.0.1",
"google-cloud-bigquery==3.11.4"
"google-cloud-bigquery==3.11.4",
"unidecode==1.3.8"
]
dynamic = ["version"]

Expand Down

0 comments on commit 2ed5943

Please sign in to comment.