Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data engineering template #45

Merged
merged 20 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions contrib/templates/data-engineering/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# data-engineering template

This template introduces a new structure for organizing data-engineering
assets in DABs.

Install it using

```
databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering --branch data-engineering
```

Note that by default this template doesn't come with any assets such as jobs or pipelines.
Follow the instructions in the template setup and README to add them.
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"welcome_message": "\nWelcome to the data-engineering pipeline template!",
"properties": {
"pipeline_name": {
"type": "string",
"description": "\nPlease provide the name of the pipeline to generate.\npipeline_name",
"default": "etl_pipeline",
"order": 1
},
"format": {
"type": "string",
"description": "\nPlease select the format to use to define this pipeline.\nformat",
"order": 2,
"enum": [
"python files",
"sql files",
"notebooks"
],
"default": "python files"
},
"only_python_files_supported": {
"skip_prompt_if": {
"properties": {
"format": {
"pattern": "python files"
}
}
},
"default": "ignored",
"type": "string",
"description": "{{fail \"Only Python files are supported in this template at this time.\"}}",
"order": 3
},
"include_job": {
"type": "string",
"description": "\nWould you like to include a job that automatically triggers this pipeline?\nThis trigger will only be enabled for production deployments.\ninclude_job",
"order": 4,
"enum": [
"yes",
"no"
],
"default": "yes"
}
},
"success_message": "\n\n🪠 New pipeline definition generated under 'assets/{{.pipeline_name}}'!"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# explorations

This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'explorations' is listed in .gitignore.
pietern marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('../transformations')\n",
"\n",
"\n",
"spark = SparkSession.builder.getOrCreate()\n",
"spark.sql('SELECT * FROM taxi_stats').show()"
lennartkats-db marked this conversation as resolved.
Show resolved Hide resolved
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 2
},
"notebookName": "ipynb-notebook",
"widgets": {}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This is the entry point for the {{.pipeline_name}} pipeline.
# It makes sure all transformations in the transformations directory are included.
import transformations
__all__ = ["transformations"]
lennartkats-db marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import dlt
from pyspark.sql import SparkSession, DataFrame


@dlt.view(
comment="Small set of taxis for development (uses LIMIT 10)"
)
def taxis() -> DataFrame:
spark = SparkSession.builder.getOrCreate()
return spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 10")
lennartkats-db marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import dlt
from pyspark.sql import SparkSession, DataFrame


@dlt.view
def taxis() -> DataFrame:
spark = SparkSession.builder.getOrCreate()
return spark.sql("SELECT * FROM samples.nyctaxi.trips")
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from sources.dev.taxis import taxis
from transformations import taxi_stats


def test_taxi_stats():
result = taxi_stats.filter_taxis(taxis())
assert len(result.collect()) > 5
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# __init__.py defines the 'transformations' Python package
import importlib
import pkgutil


# Import all modules in the package except those starting with '_', like '__init__.py'
for _, module_name, _ in pkgutil.iter_modules(__path__):
if not module_name.startswith("_"):
importlib.import_module(f"{__name__}.{module_name}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import dlt
from pyspark.sql.functions import to_date, count
from pyspark.sql import DataFrame


@dlt.table(
comment="Daily statistics of NYC Taxi trips"
)
def taxi_stats() -> DataFrame:
""" Read from the 'taxis' view from etl_pipeline/sources. """
taxis = dlt.read("taxis")

return filter_taxis(taxis)


def filter_taxis(taxis: DataFrame) -> DataFrame:
""" Group by date and calculate the number of trips. """
return (
taxis
.withColumn("pickup_date", to_date("tpep_pickup_datetime"))
.groupBy("pickup_date")
.agg(count("*").alias("number_of_trips"))
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# The job that triggers {{.pipeline_name}}.
resources:
jobs:
{{.pipeline_name}}_job:
name: {{.pipeline_name}}_job

trigger:
# Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
periodic:
interval: 1
unit: DAYS

{{- if not is_service_principal}}

email_notifications:
lennartkats-db marked this conversation as resolved.
Show resolved Hide resolved
on_failure: ${var.notifications}

{{- end}}

tasks:
- task_key: refresh_pipeline
pipeline_task:
pipeline_id: ${resources.pipelines.{{.pipeline_name}}.id}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
resources:
pipelines:
{{.pipeline_name}}:
name: {{.pipeline_name}}
serverless: true
{{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}}
## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog:
# catalog: ${var.catalog}
{{- else}}
catalog: ${var.catalog}
{{- end}}
target: ${var.schema}
libraries:
- file:
path: sources/${bundle.target}/*.py
- file:
path: main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"welcome_message": "\nWelcome to the data-engineering ingest-pipeline template!",
"properties": {
"pipeline_name": {
"type": "string",
"description": "\n{{fail \"The ingest-pipeline is not yet implemented.\"}}",
"order": 3
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"welcome_message": "\nWelcome to the data-engineering job resource template!",
"properties": {
"pipeline_name": {
"type": "string",
"description": "\n{{fail \"The ingest-pipeline is not yet implemented.\"}}",
lennartkats-db marked this conversation as resolved.
Show resolved Hide resolved
"order": 3
}
}
}
46 changes: 46 additions & 0 deletions contrib/templates/data-engineering/databricks_template_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"welcome_message": "\nWelcome to the pipeline-folders template for Databricks Asset Bundles!",
"properties": {
"project_name": {
"type": "string",
"default": "my_data_project",
"description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project\nproject_name",
"order": 1,
"pattern": "^[A-Za-z0-9_]+$",
"pattern_match_failure_message": "Name must consist of letters, numbers, and underscores."
},
"default_catalog": {
"type": "string",
"default": "{{default_catalog}}",
"pattern": "^\\w*$",
"pattern_match_failure_message": "Invalid catalog name.",
"description": "\nPlease provide an initial catalog{{if eq (default_catalog) \"\"}} (leave blank when not using Unity Catalog){{end}}.\ndefault_catalog",
"order": 2
},
"personal_schemas": {
"type": "string",
"description": "\nWould you like to use a personal schema for each user working on this project? (e.g., 'catalog.{{short_name}}')\npersonal_schemas",
"enum": [
"yes, use a schema based on the current user name during development",
"no, use a shared schema during development"
],
"order": 3
},
"shared_schema": {
"skip_prompt_if": {
"properties": {
"personal_schemas": {
"const": "yes, use a schema based on the current user name during development"
}
}
},
"type": "string",
"default": "default",
"pattern": "^\\w+$",
"pattern_match_failure_message": "Invalid schema name.",
"description": "\nPlease provide an initial schema during development.\ndefault_schema",
"order": 4
}
},
"success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ cd {{.project_name}}\n $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline --branch data-engineering\n\nRefer to the README.md file for full \"getting started\" instructions!"
lennartkats-db marked this conversation as resolved.
Show resolved Hide resolved
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Typings for Pylance in Visual Studio Code
# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
from databricks.sdk.runtime import *
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"recommendations": [
"databricks.databricks",
"ms-python.vscode-pylance",
"redhat.vscode-yaml"
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"python.analysis.stubPath": ".vscode",
"databricks.python.envFile": "${workspaceFolder}/.env",
"jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
"jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
"python.testing.pytestArgs": [
"."
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
{{- /* Unfortunately extraPaths doesn't support globs!! See: https://github.com/microsoft/pylance-release/issues/973 */}}
"python.analysis.extraPaths": ["assets/etl_pipeline"],
"files.exclude": {
"**/*.egg-info": true,
"**/__pycache__": true,
".pytest_cache": true,
},
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true,
},
}
Loading