From d750f35f7b389de12a6d2358a674b7e19a5c70bc Mon Sep 17 00:00:00 2001 From: Rafid K Date: Thu, 25 Jan 2024 22:48:35 +0000 Subject: [PATCH 01/11] Docker Compose setup for Airflow 2.8.0 --- images/airflow/2.8.0/Dockerfile | 20 +- images/airflow/2.8.0/Dockerfile.j2 | 12 +- ....sh => 100-install-needed-dnf-packages.sh} | 0 ...-folder.sh => 100-chown-airflow-folder.sh} | 0 .../200-install-debugging-tools.sh | 8 + images/airflow/2.8.0/build.sh | 2 +- images/airflow/2.8.0/dags/hello_world.py | 25 +++ images/airflow/2.8.0/docker-compose.yaml | 99 ++++++++++ images/airflow/2.8.0/entrypoint.py | 23 --- images/airflow/2.8.0/entrypoint.sh | 22 --- images/airflow/2.8.0/install_pip_packages.sh | 16 ++ images/airflow/2.8.0/plugins/.gitkeep | 0 images/airflow/2.8.0/python/README.md | 4 + images/airflow/2.8.0/python/mwaa/__init__.py | 0 .../2.8.0/python/mwaa/config/__init__.py | 0 .../2.8.0/python/mwaa/config/airflow.py | 57 ++++++ .../airflow/2.8.0/python/mwaa/config/aws.py | 15 ++ .../2.8.0/python/mwaa/config/celery.py | 39 ++++ .../2.8.0/python/mwaa/config/database.py | 38 ++++ .../airflow/2.8.0/python/mwaa/config/sqs.py | 114 ++++++++++++ .../airflow/2.8.0/python/mwaa/entrypoint.py | 175 ++++++++++++++++++ images/airflow/2.8.0/run.sh.template | 18 ++ 22 files changed, 626 insertions(+), 61 deletions(-) rename images/airflow/2.8.0/bootstrap/01-root-firstpass/{999-install-needed-dnf-packages.sh => 100-install-needed-dnf-packages.sh} (100%) rename images/airflow/2.8.0/bootstrap/03-root-secondpass/{999-chown-airflow-folder.sh => 100-chown-airflow-folder.sh} (100%) create mode 100644 images/airflow/2.8.0/bootstrap/03-root-secondpass/200-install-debugging-tools.sh create mode 100644 images/airflow/2.8.0/dags/hello_world.py create mode 100644 images/airflow/2.8.0/docker-compose.yaml delete mode 100644 images/airflow/2.8.0/entrypoint.py delete mode 100644 images/airflow/2.8.0/entrypoint.sh create mode 100755 images/airflow/2.8.0/install_pip_packages.sh create mode 100644 images/airflow/2.8.0/plugins/.gitkeep create mode 100644 images/airflow/2.8.0/python/README.md create mode 100644 images/airflow/2.8.0/python/mwaa/__init__.py create mode 100644 images/airflow/2.8.0/python/mwaa/config/__init__.py create mode 100644 images/airflow/2.8.0/python/mwaa/config/airflow.py create mode 100644 images/airflow/2.8.0/python/mwaa/config/aws.py create mode 100644 images/airflow/2.8.0/python/mwaa/config/celery.py create mode 100644 images/airflow/2.8.0/python/mwaa/config/database.py create mode 100644 images/airflow/2.8.0/python/mwaa/config/sqs.py create mode 100644 images/airflow/2.8.0/python/mwaa/entrypoint.py create mode 100755 images/airflow/2.8.0/run.sh.template diff --git a/images/airflow/2.8.0/Dockerfile b/images/airflow/2.8.0/Dockerfile index 110aff5..24080f8 100644 --- a/images/airflow/2.8.0/Dockerfile +++ b/images/airflow/2.8.0/Dockerfile @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. 
# -# This file was generated on 2024-01-18 23:09:54.111286 +# This file was generated on 2024-01-25 22:38:38.940576 # FROM public.ecr.aws/amazonlinux/amazonlinux:2023 @@ -47,7 +47,7 @@ RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh -RUN /bootstrap/01-root-firstpass/999-install-needed-dnf-packages.sh +RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -90,7 +90,9 @@ USER root RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh -RUN /bootstrap/03-root-secondpass/999-chown-airflow-folder.sh +RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh + +RUN /bootstrap/03-root-secondpass/200-install-debugging-tools.sh #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -104,19 +106,19 @@ RUN rm -rf /bootstrap # is created by the `001-create-mwaa-dir.sh` script. VOLUME ["${MWAA_HOME}"] -# TODO We should only expose this port if the comand is 'webserver'. +# TODO We should only expose this port if the command is 'webserver'. EXPOSE 8080 ENV PATH=${PATH_AIRFLOW_USER} +ENV PYTHONPATH="/python" WORKDIR ${AIRFLOW_USER_HOME} -COPY entrypoint.py /entrypoint.py -COPY entrypoint.sh /entrypoint.sh -RUN chmod +x /entrypoint.sh +# Copy python files. +COPY ./python /python USER airflow -ENTRYPOINT ["/entrypoint.sh"] +ENTRYPOINT ["python3", "-m", "mwaa.entrypoint"] -CMD /bin/bash \ No newline at end of file +CMD shell \ No newline at end of file diff --git a/images/airflow/2.8.0/Dockerfile.j2 b/images/airflow/2.8.0/Dockerfile.j2 index 17e0abb..1531c6a 100644 --- a/images/airflow/2.8.0/Dockerfile.j2 +++ b/images/airflow/2.8.0/Dockerfile.j2 @@ -84,19 +84,19 @@ RUN rm -rf /bootstrap # is created by the `001-create-mwaa-dir.sh` script. VOLUME ["${MWAA_HOME}"] -# TODO We should only expose this port if the comand is 'webserver'. +# TODO We should only expose this port if the command is 'webserver'. EXPOSE 8080 ENV PATH=${PATH_AIRFLOW_USER} +ENV PYTHONPATH="/python" WORKDIR ${AIRFLOW_USER_HOME} -COPY entrypoint.py /entrypoint.py -COPY entrypoint.sh /entrypoint.sh -RUN chmod +x /entrypoint.sh +# Copy python files. 
+COPY ./python /python USER airflow -ENTRYPOINT ["/entrypoint.sh"] +ENTRYPOINT ["python3", "-m", "mwaa.entrypoint"] -CMD /bin/bash \ No newline at end of file +CMD shell \ No newline at end of file diff --git a/images/airflow/2.8.0/bootstrap/01-root-firstpass/999-install-needed-dnf-packages.sh b/images/airflow/2.8.0/bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh similarity index 100% rename from images/airflow/2.8.0/bootstrap/01-root-firstpass/999-install-needed-dnf-packages.sh rename to images/airflow/2.8.0/bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh diff --git a/images/airflow/2.8.0/bootstrap/03-root-secondpass/999-chown-airflow-folder.sh b/images/airflow/2.8.0/bootstrap/03-root-secondpass/100-chown-airflow-folder.sh similarity index 100% rename from images/airflow/2.8.0/bootstrap/03-root-secondpass/999-chown-airflow-folder.sh rename to images/airflow/2.8.0/bootstrap/03-root-secondpass/100-chown-airflow-folder.sh diff --git a/images/airflow/2.8.0/bootstrap/03-root-secondpass/200-install-debugging-tools.sh b/images/airflow/2.8.0/bootstrap/03-root-secondpass/200-install-debugging-tools.sh new file mode 100644 index 0000000..beee3bd --- /dev/null +++ b/images/airflow/2.8.0/bootstrap/03-root-secondpass/200-install-debugging-tools.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# This is a conditional bootstrapping step to install tools that help with +# debugging, but shouldn't be installed in production. +# TODO Currently, we are always executing this step. In the near future, we +# will update the build process to make this step conditional on a user flag. + +dnf install -y vim \ No newline at end of file diff --git a/images/airflow/2.8.0/build.sh b/images/airflow/2.8.0/build.sh index 55bcf9e..32a0bd3 100755 --- a/images/airflow/2.8.0/build.sh +++ b/images/airflow/2.8.0/build.sh @@ -3,4 +3,4 @@ set -e python3 generate-dockerfile.py -docker build ./ \ No newline at end of file +docker build -t "amazon-mwaa/airflow:2.8.0" ./ diff --git a/images/airflow/2.8.0/dags/hello_world.py b/images/airflow/2.8.0/dags/hello_world.py new file mode 100644 index 0000000..cbd8a51 --- /dev/null +++ b/images/airflow/2.8.0/dags/hello_world.py @@ -0,0 +1,25 @@ +# Python imports +from datetime import datetime, timedelta + +# Airflow imports. 
+from airflow import DAG +from airflow.decorators import task + +with DAG( + dag_id='hello_world_dag', + schedule_interval=timedelta(minutes=1), + dagrun_timeout=timedelta(minutes=5), + start_date=datetime(2024, 1, 1), + catchup=False, + is_paused_upon_creation=True, +) as dag: + + @task(task_id="print_task") + def hello_world(): + print("Hello, World!") + + hello_world() + + +if __name__ == "__main__": + dag.cli() diff --git a/images/airflow/2.8.0/docker-compose.yaml b/images/airflow/2.8.0/docker-compose.yaml new file mode 100644 index 0000000..3af4d77 --- /dev/null +++ b/images/airflow/2.8.0/docker-compose.yaml @@ -0,0 +1,99 @@ +version: "3.8" + +x-airflow-common: &airflow-common + image: amazon-mwaa/airflow:2.8.0 + container_name: mwaa-280-db + restart: always + environment: + # AWS credentials + AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID} + AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY} + AWS_SESSION_TOKEN: ${AWS_SESSION_TOKEN} + AWS_REGION: ${AWS_REGION} + AWS_DEFAULT_REGION: ${AWS_REGION} + + # Database configuration + MWAA__DB__POSTGRES_HOST: "postgres" + MWAA__DB__POSTGRES_PORT: "5432" + MWAA__DB__POSTGRES_USER: "airflow" + MWAA__DB__POSTGRES_PASSWORD: "airflow" + MWAA__DB__POSTGRES_DB: "airflow" + + # SQS configuration + MWAA__SQS__QUEUE_URL: ${MWAA__SQS__QUEUE_URL} + + volumes: + - ./dags:/usr/local/airflow/dags + - ./plugins:/usr/local/airflow/plugins + depends_on: &airflow-common-depends-on + postgres: + condition: service_healthy + +services: + postgres: + image: postgres:13 + container_name: mwaa-280-db + environment: + POSTGRES_USER: airflow + POSTGRES_PASSWORD: airflow + POSTGRES_DB: airflow + volumes: + - postgres-db-volume:/var/lib/postgresql/data + healthcheck: + test: ["CMD", "pg_isready", "-U", "airflow"] + interval: 10s + retries: 5 + start_period: 5s + restart: always + ports: + - 5432:5432 + expose: + - 5432 + + # TODO Support a local SQS server to allow the user to use this Docker Compose file without a real AWS account. + # + # sqs: + # image: softwaremill/elasticmq:latest + # healthcheck: + # # https://github.com/softwaremill/elasticmq/issues/776#issuecomment-1582527921 + # test: ["CMD-SHELL", "wget -q -S -O - 127.0.0.1:9324/?Action=ListQueues"] + # interval: 10s + # retries: 5 + # start_period: 5s + # ports: + # - 9324:9324 + # - 9325:9325 + # expose: + # - 9324 + # - 9325 + + # TODO Create a local CloudWatch endpoint to allow the customer to use this Docker Compose file without a real AWS account. + # TODO Create a local CloudWatch Metrics endpoint to allow the customer to use this Docker Compose file without a real AWS account. + + spy: + <<: *airflow-common + command: spy + container_name: mwaa-280-spy + + webserver: + <<: *airflow-common + command: webserver + container_name: mwaa-280-webserver + ports: + - 8080:8080 + expose: + - 8080 + + scheduler: + <<: *airflow-common + command: scheduler + container_name: mwaa-280-scheduler + + worker: + <<: *airflow-common + command: worker + container_name: mwaa-280-worker + +volumes: + postgres-db-volume: + name: "mwaa-280-db-volume" diff --git a/images/airflow/2.8.0/entrypoint.py b/images/airflow/2.8.0/entrypoint.py deleted file mode 100644 index 09a9872..0000000 --- a/images/airflow/2.8.0/entrypoint.py +++ /dev/null @@ -1,23 +0,0 @@ -""" -This is the entrypoint of the Docker image when running Airflow components. - -The script gets called with the Airflow component name, e.g. scheduler, as the -first and only argument. It accordingly runs the requested Airlfow component -after setting up the necessary configurations. 
-""" - -import sys - - -def main() -> None: - """Entrypoint of the script.""" - print("Warming the Docker container.") - print(sys.argv) - # TODO Not yet implemented - - -if __name__ == '__main__': - main() -else: - print('This module cannot be imported.') - sys.exit(1) diff --git a/images/airflow/2.8.0/entrypoint.sh b/images/airflow/2.8.0/entrypoint.sh deleted file mode 100644 index 9ee8093..0000000 --- a/images/airflow/2.8.0/entrypoint.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -set -e - -# Check for the command and execute the corresponding Airflow component -case "$1" in - webserver) - exec python3 /entrypoint.py webserver - ;; - scheduler) - exec python3 /entrypoint.py scheduler - ;; - worker) - exec python3 /entrypoint.py worker - ;; - shell) - exec /bin/bash - ;; - *) - echo 'Error: Invalid command or no command is provided. Valid commands are: "webserver", "scheduler", "worker", or "shell".' - exit 1 - ;; -esac diff --git a/images/airflow/2.8.0/install_pip_packages.sh b/images/airflow/2.8.0/install_pip_packages.sh new file mode 100755 index 0000000..ec8dd18 --- /dev/null +++ b/images/airflow/2.8.0/install_pip_packages.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 +AIRFLOW_VERSION=2.8.0 +PYTHON_MAJOR_MINOR_VERSION=3.11 + +CONSTRAINT_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_MAJOR_MINOR_VERSION}.txt" +pip3 install --constraint "${CONSTRAINT_FILE}" \ + autopep8 \ + jinja2 \ + pycurl \ + psycopg2 \ + "celery[sqs]" \ + "apache-airflow[celery,statsd]==${AIRFLOW_VERSION}" \ + "apache-airflow-providers-amazon[aiobotocore]==${AIRFLOW_AMAZON_PROVIDERS_VERSION}" \ + watchtower \ No newline at end of file diff --git a/images/airflow/2.8.0/plugins/.gitkeep b/images/airflow/2.8.0/plugins/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/images/airflow/2.8.0/python/README.md b/images/airflow/2.8.0/python/README.md new file mode 100644 index 0000000..8496950 --- /dev/null +++ b/images/airflow/2.8.0/python/README.md @@ -0,0 +1,4 @@ +This folder gets copied over to the Docker image under the `/python` path. +Additionally, the path `/path` is added to the `PYTHONENVIRONMENT` environment +variable. As such, all the Python files under this folder are importable from +any python code. diff --git a/images/airflow/2.8.0/python/mwaa/__init__.py b/images/airflow/2.8.0/python/mwaa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/images/airflow/2.8.0/python/mwaa/config/__init__.py b/images/airflow/2.8.0/python/mwaa/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/images/airflow/2.8.0/python/mwaa/config/airflow.py b/images/airflow/2.8.0/python/mwaa/config/airflow.py new file mode 100644 index 0000000..c123a86 --- /dev/null +++ b/images/airflow/2.8.0/python/mwaa/config/airflow.py @@ -0,0 +1,57 @@ +from typing import Dict + +from mwaa.config.database import get_db_connection_string +from mwaa.config.sqs import get_sqs_endpoint, get_sqs_queue_name + + +def get_airflow_db_config() -> Dict: + """ + Retrieves the environment variables required to set the necessary Airflow + configurations under the "database" section. 
+ """ + conn_string = get_db_connection_string() + return { + "AIRFLOW__DATABASE__SQL_ALCHEMY_CONN": conn_string, + } + + +def get_airflow_celery_config() -> Dict: + """ + Retrieves the environment variables required to set the necessary Airflow + configurations for using Celery (mostly under the "database" section, but + other sections as well.) + """ + return { + "AIRFLOW__CELERY_BROKER_TRANSPORT_OPTIONS__VISIBILITY_TIMEOUT": + "43200", + "AIRFLOW__CELERY__BROKER_URL": get_sqs_endpoint(), + "AIRFLOW__CELERY__CELERY_CONFIG_OPTIONS": + "mwaa.config.celery.MWAA_CELERY_CONFIG", + "AIRFLOW__CELERY__RESULT_BACKEND": f"db+{get_db_connection_string()}", + "AIRFLOW__CELERY__WORKER_ENABLE_REMOTE_CONTROL": "False", + "AIRFLOW__CORE__EXECUTOR": "CeleryExecutor", + # Not a Celery config per-se, but is used by the Celery executor. + "AIRFLOW__OPERATORS__DEFAULT_QUEUE": get_sqs_queue_name(), + } + + +def get_airflow_core_config() -> Dict: + """ + Retrieves the environment variables required to set the necessary Airflow + configurations under the "core" section. + """ + return { + "AIRFLOW__CORE__LOAD_EXAMPLES": "False", + } + + +def get_airflow_config() -> Dict: + """ + Retrieves the environment variables required to set the necessary Airflow + configurations. + """ + return { + **get_airflow_core_config(), + **get_airflow_db_config(), + **get_airflow_celery_config(), + } diff --git a/images/airflow/2.8.0/python/mwaa/config/aws.py b/images/airflow/2.8.0/python/mwaa/config/aws.py new file mode 100644 index 0000000..46586cf --- /dev/null +++ b/images/airflow/2.8.0/python/mwaa/config/aws.py @@ -0,0 +1,15 @@ +import os + + +def get_aws_region(): + """ + Retrieves the AWS region the container should communicate with. This is + assumed to be available in either the AWS_REGION or AWS_DEFAULT_REGION + environment variables, checked respectively. + """ + region = os.environ.get('AWS_REGION') or \ + os.environ.get('AWS_DEFAULT_REGION') + if region: + return region + else: + raise RuntimeError("Region must be specified.") diff --git a/images/airflow/2.8.0/python/mwaa/config/celery.py b/images/airflow/2.8.0/python/mwaa/config/celery.py new file mode 100644 index 0000000..7506387 --- /dev/null +++ b/images/airflow/2.8.0/python/mwaa/config/celery.py @@ -0,0 +1,39 @@ +# Python imports +import copy + +# 3rd party imports +from airflow.providers.celery.executors.default_celery import ( + DEFAULT_CELERY_CONFIG +) + +# Our import +from mwaa.config.aws import get_aws_region +from mwaa.config.sqs import get_sqs_queue_name, get_sqs_queue_url + + +def create_celery_config(): + """ + Generate the configuration that will be passed to Celery. This is used in + the "celery" section of the Airflow configuration. + """ + + # We use Airflow's default condfiguration and make the changes we want. 
+ celery_config = copy.deepcopy(DEFAULT_CELERY_CONFIG) + celery_config = { + **celery_config, + "broker_transport_options": { + **celery_config["broker_transport_options"], + "predefined_queues": { + get_sqs_queue_name(): { + "url": get_sqs_queue_url() + } + }, + "is_secure": True, + "region": get_aws_region(), + }, + } + + return celery_config + + +MWAA_CELERY_CONFIG = create_celery_config() diff --git a/images/airflow/2.8.0/python/mwaa/config/database.py b/images/airflow/2.8.0/python/mwaa/config/database.py new file mode 100644 index 0000000..0fae6ea --- /dev/null +++ b/images/airflow/2.8.0/python/mwaa/config/database.py @@ -0,0 +1,38 @@ +import os +from operator import itemgetter + + +def get_db_connection_string() -> str: + """ + Retrieves the connection string to use for communicating with metadata + database. + """ + + env_vars_names = [ + "MWAA__DB__POSTGRES_HOST", + "MWAA__DB__POSTGRES_PORT", + "MWAA__DB__POSTGRES_USER", + "MWAA__DB__POSTGRES_PASSWORD", + "MWAA__DB__POSTGRES_DB", + ] + try: + ( + postgres_host, + postgres_port, + postgres_user, + postgres_password, + postgres_db, + ) = itemgetter(*env_vars_names)(os.environ) + except Exception as e: + raise RuntimeError( + 'One or more of the required of the required environment ' + + 'variables for configuring Postgres are not set. Please ' + + 'ensure you set all the following environment variables: ' + + f'{", ".join(env_vars_names)}. This was the result of the ' + + f'following exception: {e}') + + protocol = "postgresql+psycopg2" + creds = f"{postgres_user}:{postgres_password}" + addr = f"{postgres_host}:{postgres_port}" + # TODO We need to do what is the necessary to enforce 'require'. + return f'{protocol}://{creds}@{addr}/{postgres_db}?sslmode=prefer' diff --git a/images/airflow/2.8.0/python/mwaa/config/sqs.py b/images/airflow/2.8.0/python/mwaa/config/sqs.py new file mode 100644 index 0000000..b04a0b6 --- /dev/null +++ b/images/airflow/2.8.0/python/mwaa/config/sqs.py @@ -0,0 +1,114 @@ +# Python imports +import os +from urllib.parse import urlparse, urlunparse + +# 3rd party imports +import boto3 + +# Our imports +from mwaa.config.aws import get_aws_region + + +def _change_protocol_to_sqs(url) -> str: + """ + Make the given SQS endpoint Celery friendly by setting the URL protocol + to sqs://. + + Notice that there is no such thing as SQS protocol, but this is the + URL convention that Celery uses to understand that the given URL is for + an SQS queue. + """ + + parsed_url = urlparse(url) + + # Check if the scheme was missing and was defaulted to 'http' + if parsed_url.netloc == '': + # Scheme is missing, netloc is actually part of the path. + # See the documentation for urlparse() if you don't understand the + # reasoning. + new_netloc = parsed_url.path + new_path = '' + else: + # Scheme is present. + new_netloc = parsed_url.netloc + new_path = parsed_url.path + + return urlunparse(parsed_url._replace( + scheme='sqs', + netloc=new_netloc, + path=new_path + )) + + +def _get_sqs_default_endpoint(): + """ + Retrieves the default SQS endpoint for the current AWS region. + """ + + # Create a session with the specified region + session = boto3.Session(region_name=get_aws_region()) + + # Create an SQS client from this session + sqs = session.client('sqs') + + # Return the endpoint URL + return sqs.meta.endpoint_url + + +def get_sqs_endpoint() -> str: + """ + Retrieves the SQS endpoint to communicate with. The user can specify the + endpoint via the optional `MWAA_CONFIG__CUSTOM_SQS_ENDPOINT` environment + variable. 
Otherwise, the default endpoint for the current AWS region is + used. + """ + return _change_protocol_to_sqs( + os.environ.get('MWAA__SQS__CUSTOM_ENDPOINT') + or _get_sqs_default_endpoint() + ) + + +def _get_queue_name_from_url(queue_url) -> str: + """ + Extracts the queue name from an Amazon SQS queue URL. + + :param queue_url: The URL of the SQS queue. + + :return: The name of the queue or None if the URL is invalid. + """ + try: + # Validate the protocol. + if not queue_url.startswith("http://") and \ + not queue_url.startswith("https://"): + raise ValueError( + f"URL {queue_url} is should start with http:// or https://") + + parts = queue_url.split('/') + + if len(parts) < 2: + raise ValueError( + f"URL {queue_url} is invalid.") + + return parts[-1] + except Exception as e: + raise RuntimeError(f"Failed to extract queue name. Erorr: {e}") + + +def get_sqs_queue_url() -> str: + """ + Retrieves the URL of the SQS queue specified for use with Celery. + """ + env_var_name = 'MWAA__SQS__QUEUE_URL' + if env_var_name not in os.environ: + raise RuntimeError( + "The name of the SQS queue to use should be specified in an " + + f"environment variable called '{env_var_name}.'" + ) + return os.environ.get(env_var_name) # type: ignore + + +def get_sqs_queue_name() -> str: + """ + Retrieves the name of the SQS queue specified for use with Celery. + """ + return _get_queue_name_from_url(get_sqs_queue_url()) diff --git a/images/airflow/2.8.0/python/mwaa/entrypoint.py b/images/airflow/2.8.0/python/mwaa/entrypoint.py new file mode 100644 index 0000000..ac0e7e3 --- /dev/null +++ b/images/airflow/2.8.0/python/mwaa/entrypoint.py @@ -0,0 +1,175 @@ +""" +This is the entrypoint of the Docker image when running Airflow components. + +The script gets called with the Airflow component name, e.g. scheduler, as the +first and only argument. It accordingly runs the requested Airlfow component +after setting up the necessary configurations. +""" + +# Python imports +import os +import sys +import time +import subprocess + + +# 3rd party imports +from sqlalchemy import create_engine, text +from sqlalchemy.engine import Engine + +# Our imports +from mwaa.config.airflow import get_airflow_config, get_db_connection_string + + +def abort(err_msg: str, exit_code=1) -> None: + print(err_msg) + sys.exit(exit_code) + + +AVAILABLE_COMMANDS = [ + 'webserver', + 'scheduler', + 'worker', + 'triggerer', + 'shell', + 'spy', +] + + +def verify_versions(): + major, minor, micro, *_ = sys.version_info + assert os.environ['PYTHON_VERSION'] == f'{major}.{minor}.{micro}' + + +def db_lock(lock_id, timeout=300 * 1000): + def db_lock_specific(func): + def wrapped(*args, **kwargs): + func_name = func.__name__ + db_engine: Engine = create_engine( + get_db_connection_string()) # type: ignore + print(f'Obtaining lock for {func_name}...') + with db_engine.connect() as conn: + try: + conn.execute(text("SET LOCK_TIMEOUT to :timeout"), + {"timeout": timeout}) + conn.execute(text("SELECT pg_advisory_lock(:id)"), + {"id": lock_id}) + print(f'Obtained lock for {func_name}.') + + try: + func(*args, **kwargs) + except Exception as e: + abort( + f'Failed while executing {func_name}. ' + + f'Error: {e}.' + ) + except Exception as e: + abort( + f'Failed to obtain DB lock for {func_name}. ' + + f'Error: {e}.' 
+ ) + finally: + print(f'Releasing lock for {func_name}...') + conn.execute("SET LOCK_TIMEOUT TO DEFAULT") + conn.execute(text("SELECT pg_advisory_unlock(:id)"), { + "id": lock_id}) + print(f'Released lock for {func_name}') + return wrapped + return db_lock_specific + + +@db_lock(1234) +def airflow_db_init(environ): + print("Calling 'airflow db migrate' to initialize the database.") + response = subprocess.run(["airflow db migrate"], + shell=True, check=True, text=True, env=environ) + + if response.returncode: + raise RuntimeError(f'Failed to migrate db. Error: {response.stderr}') + + +@db_lock(5678) +def create_www_user(environ): + print("Calling 'airflow users create' to create the webserver user.") + response = subprocess.run(' '.join([ + "airflow", "users", "create", + "--username", "airflow", + "--firstname", "Airflow", + "--lastname", "Admin", + "--email", "airflow@example.com", + "--role", "Admin", + "--password", "airflow", + ]), shell=True, check=True, text=True, env=environ) + + if response.returncode: + raise RuntimeError(f'Failed to create user. Error: {response.stderr}') + + +def export_env_variables(environ): + # Get the home directory of the current user + home_dir = os.path.expanduser("~") + bashrc_path = os.path.join(home_dir, ".bashrc") + bash_profile_path = os.path.join(home_dir, ".bash_profile") + + # Environment variables to append + env_vars_to_append = [ + # TODO Need to escape value. + f'export {key}="{value}"\n' + for key, value in environ.items() + ] + + # Append to .bashrc + with open(bashrc_path, "a") as bashrc: + bashrc.writelines(env_vars_to_append) + + # Append to .bash_profile + with open(bash_profile_path, "a") as bash_profile: + bash_profile.writelines(env_vars_to_append) + + +def main() -> None: + """Entrypoint of the script.""" + + try: + (_, command, ) = sys.argv + if command not in AVAILABLE_COMMANDS: + exit(f'Invalid command: {command}. ' + + f'Use one of {", ".join(AVAILABLE_COMMANDS)}.') + except Exception as e: + print(sys.argv) + exit('Invalid arguments. Please provide one argument with one of' + + f'the values: {", ".join(AVAILABLE_COMMANDS)}. Error was {e}.') + + print(f"Warming a Docker container for an Airflow {command}.") + + # Add the necessary environment variables. + environ = { + **os.environ, + **get_airflow_config() + } + + airflow_db_init(environ) + create_www_user(environ) + + # Export the environment variables to .bashrc and .bash_profile to enable + # users to run a shell on the container and have the necessary environment + # variables set for using airflow CLI. + export_env_variables(environ) + + match command: + case 'shell': + os.execlpe('/bin/bash', '/bin/bash', environ) + case 'spy': + while True: + time.sleep(1) + case 'worker': + os.execlpe('airflow', 'airflow', 'celery', 'worker', environ) + case _: + os.execlpe('airflow', 'airflow', command, environ) + + +if __name__ == '__main__': + main() +else: + print('This module cannot be imported.') + sys.exit(1) diff --git a/images/airflow/2.8.0/run.sh.template b/images/airflow/2.8.0/run.sh.template new file mode 100755 index 0000000..6126b24 --- /dev/null +++ b/images/airflow/2.8.0/run.sh.template @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +# You can use this template to run the Docker Compose setup with your personal +# AWS account. Create a copy of this file and fill in the placeholderss. 
+
+export AWS_ACCOUNT_ID=
+export AWS_REGION=
+export AWS_ACCESS_KEY_ID=
+export AWS_SECRET_ACCESS_KEY=
+export AWS_SESSION_TOKEN=
+
+export MWAA__SQS__QUEUE_URL="<The URL of the SQS queue to use with Celery>"
+
+# Build the Docker image
+./build.sh
+
+docker compose up
\ No newline at end of file

From da1df8ece2e30d623d049c46297e7c8f8603c5cc Mon Sep 17 00:00:00 2001
From: Rafid K
Date: Tue, 30 Jan 2024 08:50:50 -0800
Subject: [PATCH 02/11] Support 'explorer' and 'dev' images (#30)

To aid with development, I am introducing different image types, controlled
by the following two build arguments:

- `build_type`: This build argument has three possible values:
  - `standard`: This is the standard build type. It is what customers use.
  - `explorer`: The 'explorer' build type is almost identical to the
    'standard' build type, but it doesn't include the entrypoint. This is
    useful for debugging purposes: you can run the image and look around its
    content without starting Airflow, which might require further setup.
  - `explorer-privileged`: This is similar to the 'explorer' build type, but
    additionally uses the root user, giving the user of this Docker image
    elevated permissions. The user can, thus, install packages, remove
    packages, or do anything else.
- `dev`: When this build argument is set to True, additional packages are
  installed to aid with development.

For each combination of these two build arguments, a different Docker image
is generated. Thus, we are currently generating these images:

- `amazon-mwaa/airflow:2.8.0`
- `amazon-mwaa/airflow:2.8.0-dev`
- `amazon-mwaa/airflow:2.8.0-explorer`
- `amazon-mwaa/airflow:2.8.0-explorer-dev`
- `amazon-mwaa/airflow:2.8.0-explorer-privileged`
- `amazon-mwaa/airflow:2.8.0-explorer-privileged-dev`
---
 images/airflow/2.8.0/.gitignore | 1 +
 images/airflow/2.8.0/Dockerfile.j2 | 26 +++-
 .../2.8.0/{ => Dockerfiles}/Dockerfile | 8 +-
 .../airflow/2.8.0/Dockerfiles/Dockerfile-dev | 126 ++++++++++++++++++
 .../2.8.0/Dockerfiles/Dockerfile-explorer | 122 +++++++++++++++++
 .../2.8.0/Dockerfiles/Dockerfile-explorer-dev | 124 +++++++++++++++++
 .../Dockerfile-explorer-privileged | 122 +++++++++++++++++
 .../Dockerfile-explorer-privileged-dev | 124 +++++++++++++++++
 ...ls.sh => 200-devonly-install-dev-tools.sh} | 2 -
 images/airflow/2.8.0/build.sh | 21 ++-
 images/airflow/2.8.0/generate-dockerfile.py | 63 ++++++++-
 images/airflow/2.8.0/python/README.md | 2 +-
 .../2.8.0/python/mwaa/config/airflow.py | 2 +-
 .../2.8.0/python/mwaa/config/database.py | 6 +-
 images/airflow/2.8.0/run.sh.template | 0
 15 files changed, 732 insertions(+), 17 deletions(-)
 create mode 100644 images/airflow/2.8.0/.gitignore
 rename images/airflow/2.8.0/{ => Dockerfiles}/Dockerfile (96%)
 create mode 100644 images/airflow/2.8.0/Dockerfiles/Dockerfile-dev
 create mode 100644 images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer
 create mode 100644 images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev
 create mode 100644 images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged
 create mode 100644 images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev
 rename images/airflow/2.8.0/bootstrap/03-root-secondpass/{200-install-debugging-tools.sh => 200-devonly-install-dev-tools.sh} (51%)
 mode change 100755 => 100644 images/airflow/2.8.0/run.sh.template
diff --git a/images/airflow/2.8.0/.gitignore b/images/airflow/2.8.0/.gitignore
new file mode 100644
index 0000000..245773f
--- /dev/null
+++ b/images/airflow/2.8.0/.gitignore
@@ -0,0 +1 @@
+run.sh
diff --git a/images/airflow/2.8.0/Dockerfile.j2
b/images/airflow/2.8.0/Dockerfile.j2 index 1531c6a..d4bf425 100644 --- a/images/airflow/2.8.0/Dockerfile.j2 +++ b/images/airflow/2.8.0/Dockerfile.j2 @@ -95,8 +95,32 @@ WORKDIR ${AIRFLOW_USER_HOME} # Copy python files. COPY ./python /python +{% if build_type == 'standard' %} +{# This is the standard build type. it is what customer uses.#} USER airflow ENTRYPOINT ["python3", "-m", "mwaa.entrypoint"] -CMD shell \ No newline at end of file +CMD shell +{% elif build_type == 'explorer' %} +{# +The 'explorer' build type is almost identical to the 'standard' build type but +it doesn't include the entrypoint. This is useful for debugging purposes to run +the image and look around its content without starting airflow, which might +require further setup. +#} +USER airflow + +ENTRYPOINT ["/bin/bash"] +{% elif build_type == 'explorer-privileged' %} +{# +This is similar to the 'explorer' build type, but additionally uses the root +user, giving the user of this Docker image elevated permissions. The user can, +thus, install packages, remove packages, or anything else. +#} +USER root + +ENTRYPOINT ["/bin/bash"] +{% else %} +{{ raise("Invalid build type.") }} +{% endif %} \ No newline at end of file diff --git a/images/airflow/2.8.0/Dockerfile b/images/airflow/2.8.0/Dockerfiles/Dockerfile similarity index 96% rename from images/airflow/2.8.0/Dockerfile rename to images/airflow/2.8.0/Dockerfiles/Dockerfile index 24080f8..2c79647 100644 --- a/images/airflow/2.8.0/Dockerfile +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-25 22:38:38.940576 +# This file was generated on 2024-01-30 16:34:46.229453 # FROM public.ecr.aws/amazonlinux/amazonlinux:2023 @@ -92,8 +92,6 @@ RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh -RUN /bootstrap/03-root-secondpass/200-install-debugging-tools.sh - #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # END marker for root user, second pass bootstrapping steps. @@ -117,8 +115,10 @@ WORKDIR ${AIRFLOW_USER_HOME} # Copy python files. COPY ./python /python + + USER airflow ENTRYPOINT ["python3", "-m", "mwaa.entrypoint"] -CMD shell \ No newline at end of file +CMD shell diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev new file mode 100644 index 0000000..1cfb1e0 --- /dev/null +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev @@ -0,0 +1,126 @@ +# +# WARNING: Don't change this file manually. This file is auto-generated from +# the Jinja2-templated Dockerfile.j2 file, so you need to change that file +# instead. +# +# This file was generated on 2024-01-30 16:34:46.218794 +# + +FROM public.ecr.aws/amazonlinux/amazonlinux:2023 + +# Environment variables +ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 +ENV AIRFLOW_USER_HOME=/usr/local/airflow +ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} +ENV AIRFLOW_VERSION=2.8.0 +ENV MWAA_HOME=/usr/local/mwaa +ENV PYTHON_VERSION=3.11.7 + +# We don't want those variables to stay in the final image, so we use ARG instead of ENV. 
+ARG PATH_DEFAULT=${PATH} +ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} +ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c +ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms +ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 +ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c +ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb + +# Copy bootstrapping files. +COPY ./bootstrap /bootstrap +RUN chmod -R +x /bootstrap + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, first pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + +RUN /bootstrap/01-root-firstpass/001-init.sh + +RUN /bootstrap/01-root-firstpass/002-install-python.sh + +RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh + +RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh + +RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh + +RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh + + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, first pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# > BEGINNING marker for airflow user bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Switch to 'airflow' user and update the PATH environment variable. +USER airflow +ENV PATH=${PATH_AIRFLOW_USER} + + +RUN /bootstrap/02-airflow/001-install-airflow.sh + + +# Revert the PATH and user. +ENV PATH=${PATH_DEFAULT} +USER root + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# > END marker for airflow user bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, second pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Those steps are also executed as the root user. However, they rely on the +# successfull execution of the airflow user bootstrapping steps. For example, +# giving ownership of the Airflow home user to the 'airflow' user requires the +# the folder to be fully setup first. + + +RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh + +RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh + +RUN /bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh + + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, second pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# remove bootstrapping files. +RUN rm -rf /bootstrap + +# Create a volume for syncing files with the sidecar. The actual folder +# is created by the `001-create-mwaa-dir.sh` script. +VOLUME ["${MWAA_HOME}"] + +# TODO We should only expose this port if the command is 'webserver'. 
+EXPOSE 8080 + +ENV PATH=${PATH_AIRFLOW_USER} +ENV PYTHONPATH="/python" + +WORKDIR ${AIRFLOW_USER_HOME} + +# Copy python files. +COPY ./python /python + + + +USER airflow + +ENTRYPOINT ["python3", "-m", "mwaa.entrypoint"] + +CMD shell diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer new file mode 100644 index 0000000..a228c8f --- /dev/null +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer @@ -0,0 +1,122 @@ +# +# WARNING: Don't change this file manually. This file is auto-generated from +# the Jinja2-templated Dockerfile.j2 file, so you need to change that file +# instead. +# +# This file was generated on 2024-01-30 16:34:46.232966 +# + +FROM public.ecr.aws/amazonlinux/amazonlinux:2023 + +# Environment variables +ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 +ENV AIRFLOW_USER_HOME=/usr/local/airflow +ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} +ENV AIRFLOW_VERSION=2.8.0 +ENV MWAA_HOME=/usr/local/mwaa +ENV PYTHON_VERSION=3.11.7 + +# We don't want those variables to stay in the final image, so we use ARG instead of ENV. +ARG PATH_DEFAULT=${PATH} +ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} +ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c +ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms +ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 +ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c +ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb + +# Copy bootstrapping files. +COPY ./bootstrap /bootstrap +RUN chmod -R +x /bootstrap + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, first pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + +RUN /bootstrap/01-root-firstpass/001-init.sh + +RUN /bootstrap/01-root-firstpass/002-install-python.sh + +RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh + +RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh + +RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh + +RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh + + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, first pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# > BEGINNING marker for airflow user bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Switch to 'airflow' user and update the PATH environment variable. +USER airflow +ENV PATH=${PATH_AIRFLOW_USER} + + +RUN /bootstrap/02-airflow/001-install-airflow.sh + + +# Revert the PATH and user. +ENV PATH=${PATH_DEFAULT} +USER root + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# > END marker for airflow user bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, second pass bootstrapping steps. 
+#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Those steps are also executed as the root user. However, they rely on the +# successfull execution of the airflow user bootstrapping steps. For example, +# giving ownership of the Airflow home user to the 'airflow' user requires the +# the folder to be fully setup first. + + +RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh + +RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh + + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, second pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# remove bootstrapping files. +RUN rm -rf /bootstrap + +# Create a volume for syncing files with the sidecar. The actual folder +# is created by the `001-create-mwaa-dir.sh` script. +VOLUME ["${MWAA_HOME}"] + +# TODO We should only expose this port if the command is 'webserver'. +EXPOSE 8080 + +ENV PATH=${PATH_AIRFLOW_USER} +ENV PYTHONPATH="/python" + +WORKDIR ${AIRFLOW_USER_HOME} + +# Copy python files. +COPY ./python /python + + + +USER airflow + +ENTRYPOINT ["/bin/bash"] diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev new file mode 100644 index 0000000..97e17fe --- /dev/null +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev @@ -0,0 +1,124 @@ +# +# WARNING: Don't change this file manually. This file is auto-generated from +# the Jinja2-templated Dockerfile.j2 file, so you need to change that file +# instead. +# +# This file was generated on 2024-01-30 16:34:46.222403 +# + +FROM public.ecr.aws/amazonlinux/amazonlinux:2023 + +# Environment variables +ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 +ENV AIRFLOW_USER_HOME=/usr/local/airflow +ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} +ENV AIRFLOW_VERSION=2.8.0 +ENV MWAA_HOME=/usr/local/mwaa +ENV PYTHON_VERSION=3.11.7 + +# We don't want those variables to stay in the final image, so we use ARG instead of ENV. +ARG PATH_DEFAULT=${PATH} +ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} +ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c +ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms +ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 +ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c +ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb + +# Copy bootstrapping files. +COPY ./bootstrap /bootstrap +RUN chmod -R +x /bootstrap + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, first pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + +RUN /bootstrap/01-root-firstpass/001-init.sh + +RUN /bootstrap/01-root-firstpass/002-install-python.sh + +RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh + +RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh + +RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh + +RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh + + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, first pass bootstrapping steps. 
+#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# > BEGINNING marker for airflow user bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Switch to 'airflow' user and update the PATH environment variable. +USER airflow +ENV PATH=${PATH_AIRFLOW_USER} + + +RUN /bootstrap/02-airflow/001-install-airflow.sh + + +# Revert the PATH and user. +ENV PATH=${PATH_DEFAULT} +USER root + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# > END marker for airflow user bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, second pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Those steps are also executed as the root user. However, they rely on the +# successfull execution of the airflow user bootstrapping steps. For example, +# giving ownership of the Airflow home user to the 'airflow' user requires the +# the folder to be fully setup first. + + +RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh + +RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh + +RUN /bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh + + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, second pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# remove bootstrapping files. +RUN rm -rf /bootstrap + +# Create a volume for syncing files with the sidecar. The actual folder +# is created by the `001-create-mwaa-dir.sh` script. +VOLUME ["${MWAA_HOME}"] + +# TODO We should only expose this port if the command is 'webserver'. +EXPOSE 8080 + +ENV PATH=${PATH_AIRFLOW_USER} +ENV PYTHONPATH="/python" + +WORKDIR ${AIRFLOW_USER_HOME} + +# Copy python files. +COPY ./python /python + + + +USER airflow + +ENTRYPOINT ["/bin/bash"] diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged new file mode 100644 index 0000000..84f8a86 --- /dev/null +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged @@ -0,0 +1,122 @@ +# +# WARNING: Don't change this file manually. This file is auto-generated from +# the Jinja2-templated Dockerfile.j2 file, so you need to change that file +# instead. +# +# This file was generated on 2024-01-30 16:34:46.236417 +# + +FROM public.ecr.aws/amazonlinux/amazonlinux:2023 + +# Environment variables +ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 +ENV AIRFLOW_USER_HOME=/usr/local/airflow +ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} +ENV AIRFLOW_VERSION=2.8.0 +ENV MWAA_HOME=/usr/local/mwaa +ENV PYTHON_VERSION=3.11.7 + +# We don't want those variables to stay in the final image, so we use ARG instead of ENV. 
+ARG PATH_DEFAULT=${PATH} +ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} +ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c +ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms +ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 +ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c +ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb + +# Copy bootstrapping files. +COPY ./bootstrap /bootstrap +RUN chmod -R +x /bootstrap + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, first pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + +RUN /bootstrap/01-root-firstpass/001-init.sh + +RUN /bootstrap/01-root-firstpass/002-install-python.sh + +RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh + +RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh + +RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh + +RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh + + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, first pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# > BEGINNING marker for airflow user bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Switch to 'airflow' user and update the PATH environment variable. +USER airflow +ENV PATH=${PATH_AIRFLOW_USER} + + +RUN /bootstrap/02-airflow/001-install-airflow.sh + + +# Revert the PATH and user. +ENV PATH=${PATH_DEFAULT} +USER root + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# > END marker for airflow user bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, second pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Those steps are also executed as the root user. However, they rely on the +# successfull execution of the airflow user bootstrapping steps. For example, +# giving ownership of the Airflow home user to the 'airflow' user requires the +# the folder to be fully setup first. + + +RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh + +RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh + + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, second pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# remove bootstrapping files. +RUN rm -rf /bootstrap + +# Create a volume for syncing files with the sidecar. The actual folder +# is created by the `001-create-mwaa-dir.sh` script. +VOLUME ["${MWAA_HOME}"] + +# TODO We should only expose this port if the command is 'webserver'. +EXPOSE 8080 + +ENV PATH=${PATH_AIRFLOW_USER} +ENV PYTHONPATH="/python" + +WORKDIR ${AIRFLOW_USER_HOME} + +# Copy python files. 
+COPY ./python /python + + + +USER root + +ENTRYPOINT ["/bin/bash"] diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev new file mode 100644 index 0000000..8079639 --- /dev/null +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev @@ -0,0 +1,124 @@ +# +# WARNING: Don't change this file manually. This file is auto-generated from +# the Jinja2-templated Dockerfile.j2 file, so you need to change that file +# instead. +# +# This file was generated on 2024-01-30 16:34:46.225951 +# + +FROM public.ecr.aws/amazonlinux/amazonlinux:2023 + +# Environment variables +ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 +ENV AIRFLOW_USER_HOME=/usr/local/airflow +ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} +ENV AIRFLOW_VERSION=2.8.0 +ENV MWAA_HOME=/usr/local/mwaa +ENV PYTHON_VERSION=3.11.7 + +# We don't want those variables to stay in the final image, so we use ARG instead of ENV. +ARG PATH_DEFAULT=${PATH} +ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} +ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c +ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms +ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 +ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c +ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb + +# Copy bootstrapping files. +COPY ./bootstrap /bootstrap +RUN chmod -R +x /bootstrap + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, first pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + +RUN /bootstrap/01-root-firstpass/001-init.sh + +RUN /bootstrap/01-root-firstpass/002-install-python.sh + +RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh + +RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh + +RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh + +RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh + + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, first pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# > BEGINNING marker for airflow user bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Switch to 'airflow' user and update the PATH environment variable. +USER airflow +ENV PATH=${PATH_AIRFLOW_USER} + + +RUN /bootstrap/02-airflow/001-install-airflow.sh + + +# Revert the PATH and user. +ENV PATH=${PATH_DEFAULT} +USER root + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# > END marker for airflow user bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, second pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Those steps are also executed as the root user. 
However, they rely on the +# successfull execution of the airflow user bootstrapping steps. For example, +# giving ownership of the Airflow home user to the 'airflow' user requires the +# the folder to be fully setup first. + + +RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh + +RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh + +RUN /bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh + + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, second pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# remove bootstrapping files. +RUN rm -rf /bootstrap + +# Create a volume for syncing files with the sidecar. The actual folder +# is created by the `001-create-mwaa-dir.sh` script. +VOLUME ["${MWAA_HOME}"] + +# TODO We should only expose this port if the command is 'webserver'. +EXPOSE 8080 + +ENV PATH=${PATH_AIRFLOW_USER} +ENV PYTHONPATH="/python" + +WORKDIR ${AIRFLOW_USER_HOME} + +# Copy python files. +COPY ./python /python + + + +USER root + +ENTRYPOINT ["/bin/bash"] diff --git a/images/airflow/2.8.0/bootstrap/03-root-secondpass/200-install-debugging-tools.sh b/images/airflow/2.8.0/bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh similarity index 51% rename from images/airflow/2.8.0/bootstrap/03-root-secondpass/200-install-debugging-tools.sh rename to images/airflow/2.8.0/bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh index beee3bd..aabc129 100644 --- a/images/airflow/2.8.0/bootstrap/03-root-secondpass/200-install-debugging-tools.sh +++ b/images/airflow/2.8.0/bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh @@ -2,7 +2,5 @@ # This is a conditional bootstrapping step to install tools that help with # debugging, but shouldn't be installed in production. -# TODO Currently, we are always executing this step. In the near future, we -# will update the build process to make this step conditional on a user flag. dnf install -y vim \ No newline at end of file diff --git a/images/airflow/2.8.0/build.sh b/images/airflow/2.8.0/build.sh index 32a0bd3..b7eee52 100755 --- a/images/airflow/2.8.0/build.sh +++ b/images/airflow/2.8.0/build.sh @@ -3,4 +3,23 @@ set -e python3 generate-dockerfile.py -docker build -t "amazon-mwaa/airflow:2.8.0" ./ +#!/bin/bash + +for dev in "True" "False"; do + for build_type in "standard" "explorer" "explorer-privileged"; do + dockerfile_name="Dockerfile" + tag_name="amazon-mwaa/airflow:2.8.0" + + if [[ "$build_type" != "standard" ]]; then + dockerfile_name="${dockerfile_name}-${build_type}" + tag_name="${tag_name}-${build_type}" + fi + + if [[ "$dev" == "True" ]]; then + dockerfile_name="${dockerfile_name}-dev" + tag_name="${tag_name}-dev" + fi + + docker build -f "./Dockerfiles/${dockerfile_name}" -t "${tag_name}" ./ + done +done \ No newline at end of file diff --git a/images/airflow/2.8.0/generate-dockerfile.py b/images/airflow/2.8.0/generate-dockerfile.py index 217b0a7..3d57e87 100644 --- a/images/airflow/2.8.0/generate-dockerfile.py +++ b/images/airflow/2.8.0/generate-dockerfile.py @@ -28,11 +28,54 @@ sys.exit(1) -def main() -> None: - """Entrypoint of the script.""" +def raise_helper(msg) -> None: + """ + Helper method to enable Jinja2 templates to raise an exception. + """ + raise RuntimeError(msg) + + +def is_dev_bootstrapping_step(bootstrap_filename: str) -> bool: + """ + Determines whether the given bootstrap filename is supposed to run only in + development images. 
This is decided based on the prefix "devonly-" in the + filename directly following the index prefix (the numbers at the + beginning.) For example, the file `200-devonly-install-dev-tools.sh` + matches this criteria and will be only executed for building development + images. + + :param bootstrap_filename: The name of the bootstrapping file. + + :return True or False. + """ + comps = bootstrap_filename.split('-') + return len(comps) > 1 and comps[1] == 'devonly' + + +def generate_dockerfile(output_file, build_type='standard', dev=False) -> None: + """ + Generate a Dockerfile based on the given build arguments. + + :param build_type: Specifies the build type. This can have the following + values: + - standard: This is the standard build type. it is what customer uses. + - explorer: The 'explorer' build type is almost identical to the + 'standard' build type but it doesn't include the entrypoint. This is + useful for debugging purposes to run the image and look around its + content without starting airflow, which might require further setup. + - explorer-root: This is similar to the 'explorer' build type, but + additionally uses the root user, giving the user of this Docker image + elevated permissions. The user can, thus, install packages, remove + packages, or anything else. + + :param dev: Whether to produce a development image or a production one. + Development images have extra stuff that are useful during development, + e.g. editors, sudo, etc. + """ # Load Dockerfile Jinja template. file_loader = FileSystemLoader('.') env = Environment(loader=file_loader) + env.globals['raise'] = raise_helper template = env.get_template('Dockerfile.j2') # Template data @@ -40,20 +83,24 @@ def main() -> None: 'bootstrapping_scripts_root_firstpass': [ os.path.join('/bootstrap/01-root-firstpass', name).strip() for name in sorted(os.listdir('./bootstrap/01-root-firstpass')) + if not is_dev_bootstrapping_step(name) or dev is True ], 'bootstrapping_scripts_airflow': [ os.path.join('/bootstrap/02-airflow', name).strip() for name in sorted(os.listdir('./bootstrap/02-airflow')) + if not is_dev_bootstrapping_step(name) or dev is True ], 'bootstrapping_scripts_root_secondpass': [ os.path.join('/bootstrap/03-root-secondpass', name).strip() for name in sorted(os.listdir('./bootstrap/03-root-secondpass')) + if not is_dev_bootstrapping_step(name) or dev is True ], + 'build_type': build_type } # Render the template and generate the Dockerfile output = template.render(data) - with open('Dockerfile', 'w') as f: + with open(os.path.join('./Dockerfiles', output_file), 'w') as f: f.write(f''' # # WARNING: Don't change this file manually. This file is auto-generated from @@ -69,7 +116,15 @@ def main() -> None: if __name__ == '__main__': - main() + for dev in [True, False]: + for build_type in ['standard', 'explorer', 'explorer-privileged']: + dockerfile_name = 'Dockerfile' + if build_type != 'standard': + dockerfile_name = f'{dockerfile_name}-{build_type}' + if dev: + dockerfile_name = f'{dockerfile_name}-dev' + generate_dockerfile(dockerfile_name, + build_type=build_type, dev=dev) else: print('This module cannot be imported.') sys.exit(1) diff --git a/images/airflow/2.8.0/python/README.md b/images/airflow/2.8.0/python/README.md index 8496950..6f37576 100644 --- a/images/airflow/2.8.0/python/README.md +++ b/images/airflow/2.8.0/python/README.md @@ -1,4 +1,4 @@ This folder gets copied over to the Docker image under the `/python` path. 
-Additionally, the path `/path` is added to the `PYTHONENVIRONMENT` environment
+Additionally, the path `/python` is added to the `PYTHONPATH` environment
 variable. As such, all the Python files under this folder are importable from
 any python code.
diff --git a/images/airflow/2.8.0/python/mwaa/config/airflow.py b/images/airflow/2.8.0/python/mwaa/config/airflow.py
index c123a86..5a76468 100644
--- a/images/airflow/2.8.0/python/mwaa/config/airflow.py
+++ b/images/airflow/2.8.0/python/mwaa/config/airflow.py
@@ -18,7 +18,7 @@ def get_airflow_db_config() -> Dict:
 def get_airflow_celery_config() -> Dict:
     """
     Retrieves the environment variables required to set the necessary Airflow
-    configurations for using Celery (mostly under the "database" section, but
+    configurations for using Celery (mostly under the "celery" section, but
     other sections as well.)
     """
     return {
diff --git a/images/airflow/2.8.0/python/mwaa/config/database.py b/images/airflow/2.8.0/python/mwaa/config/database.py
index 0fae6ea..5e08f6c 100644
--- a/images/airflow/2.8.0/python/mwaa/config/database.py
+++ b/images/airflow/2.8.0/python/mwaa/config/database.py
@@ -25,9 +25,9 @@ def get_db_connection_string() -> str:
         ) = itemgetter(*env_vars_names)(os.environ)
     except Exception as e:
         raise RuntimeError(
-            'One or more of the required of the required environment ' +
-            'variables for configuring Postgres are not set. Please ' +
-            'ensure you set all the following environment variables: ' +
+            'One or more of the required environment variables for ' +
+            'configuring Postgres are not set. Please ensure you set all ' +
+            'the following environment variables: ' +
             f'{", ".join(env_vars_names)}. This was the result of the ' +
             f'following exception: {e}')
 
diff --git a/images/airflow/2.8.0/run.sh.template b/images/airflow/2.8.0/run.sh.template
old mode 100755
new mode 100644

From d0560030557a46e1d93ff6597a1a08568468bae0 Mon Sep 17 00:00:00 2001
From: Rafid K
Date: Wed, 31 Jan 2024 00:25:03 +0000
Subject: [PATCH 03/11] Extract common stuff from the different Dockerfiles

All Dockerfiles share most of the steps, apart from a couple of steps at
the end. As such, to cut build time, I extracted the common steps into a
separate Docker image that all other images build on top of.
--- .../{Dockerfile.j2 => Dockerfile.base.j2} | 30 ---- .../airflow/2.8.0/Dockerfile.derivatives.j2 | 60 ++++++++ images/airflow/2.8.0/Dockerfiles/Dockerfile | 113 +------------- .../airflow/2.8.0/Dockerfiles/Dockerfile-dev | 110 +------------- .../2.8.0/Dockerfiles/Dockerfile-explorer | 113 +------------- .../2.8.0/Dockerfiles/Dockerfile-explorer-dev | 110 +------------- .../Dockerfile-explorer-privileged | 113 +------------- .../Dockerfile-explorer-privileged-dev | 110 +------------- .../airflow/2.8.0/Dockerfiles/Dockerfile.base | 106 ++++++++++++++ .../001-devonly-install-dev-tools.sh} | 0 images/airflow/2.8.0/build.sh | 7 +- images/airflow/2.8.0/explore-image.sh | 2 + ...-dockerfile.py => generate-dockerfiles.py} | 138 +++++++++++++----- 13 files changed, 301 insertions(+), 711 deletions(-) rename images/airflow/2.8.0/{Dockerfile.j2 => Dockerfile.base.j2} (80%) create mode 100644 images/airflow/2.8.0/Dockerfile.derivatives.j2 create mode 100644 images/airflow/2.8.0/Dockerfiles/Dockerfile.base rename images/airflow/2.8.0/{bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh => bootstrap-dev/001-devonly-install-dev-tools.sh} (100%) create mode 100755 images/airflow/2.8.0/explore-image.sh rename images/airflow/2.8.0/{generate-dockerfile.py => generate-dockerfiles.py} (63%) diff --git a/images/airflow/2.8.0/Dockerfile.j2 b/images/airflow/2.8.0/Dockerfile.base.j2 similarity index 80% rename from images/airflow/2.8.0/Dockerfile.j2 rename to images/airflow/2.8.0/Dockerfile.base.j2 index d4bf425..c8c05b8 100644 --- a/images/airflow/2.8.0/Dockerfile.j2 +++ b/images/airflow/2.8.0/Dockerfile.base.j2 @@ -94,33 +94,3 @@ WORKDIR ${AIRFLOW_USER_HOME} # Copy python files. COPY ./python /python - -{% if build_type == 'standard' %} -{# This is the standard build type. it is what customer uses.#} -USER airflow - -ENTRYPOINT ["python3", "-m", "mwaa.entrypoint"] - -CMD shell -{% elif build_type == 'explorer' %} -{# -The 'explorer' build type is almost identical to the 'standard' build type but -it doesn't include the entrypoint. This is useful for debugging purposes to run -the image and look around its content without starting airflow, which might -require further setup. -#} -USER airflow - -ENTRYPOINT ["/bin/bash"] -{% elif build_type == 'explorer-privileged' %} -{# -This is similar to the 'explorer' build type, but additionally uses the root -user, giving the user of this Docker image elevated permissions. The user can, -thus, install packages, remove packages, or anything else. -#} -USER root - -ENTRYPOINT ["/bin/bash"] -{% else %} -{{ raise("Invalid build type.") }} -{% endif %} \ No newline at end of file diff --git a/images/airflow/2.8.0/Dockerfile.derivatives.j2 b/images/airflow/2.8.0/Dockerfile.derivatives.j2 new file mode 100644 index 0000000..b499484 --- /dev/null +++ b/images/airflow/2.8.0/Dockerfile.derivatives.j2 @@ -0,0 +1,60 @@ +FROM amazon-mwaa/airflow:2.8.0-base + +{% if bootstrapping_scripts_dev %} + +# Copy bootstrapping files. +COPY ./bootstrap-dev /bootstrap-dev +RUN chmod -R +x /bootstrap-dev + +{# +Those steps are only exectued for development images. Those are images that +contain additional packages that help with debugging, e.g. editors, etc., but +are not supposed to be in production. +#} + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for dev bootstrapping steps. 
+#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +{% for filepath in bootstrapping_scripts_dev %} +RUN {{ filepath }} +{% endfor %} + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for dev bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# remove bootstrapping files. +RUN rm -rf /bootstrap-dev + +{% endif %} + +{% if build_type == 'standard' %} +{# This is the standard build type. it is what customer uses.#} +USER airflow + +ENTRYPOINT ["python3", "-m", "mwaa.entrypoint"] + +CMD shell +{% elif build_type == 'explorer' %} +{# +The 'explorer' build type is almost identical to the 'standard' build type but +it doesn't include the entrypoint. This is useful for debugging purposes to run +the image and look around its content without starting airflow, which might +require further setup. +#} +USER airflow + +ENTRYPOINT ["/bin/bash"] +{% elif build_type == 'explorer-privileged' %} +{# +This is similar to the 'explorer' build type, but additionally uses the root +user, giving the user of this Docker image elevated permissions. The user can, +thus, install packages, remove packages, or anything else. +#} +USER root + +ENTRYPOINT ["/bin/bash"] +{% else %} +{{ raise("Invalid build type.") }} +{% endif %} \ No newline at end of file diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile b/images/airflow/2.8.0/Dockerfiles/Dockerfile index 2c79647..82d51aa 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile @@ -3,119 +3,10 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-30 16:34:46.229453 +# This file was generated on 2024-01-31 02:49:24.657364 # -FROM public.ecr.aws/amazonlinux/amazonlinux:2023 - -# Environment variables -ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 -ENV AIRFLOW_USER_HOME=/usr/local/airflow -ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} -ENV AIRFLOW_VERSION=2.8.0 -ENV MWAA_HOME=/usr/local/mwaa -ENV PYTHON_VERSION=3.11.7 - -# We don't want those variables to stay in the final image, so we use ARG instead of ENV. -ARG PATH_DEFAULT=${PATH} -ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} -ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c -ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms -ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 -ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c -ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb - -# Copy bootstrapping files. -COPY ./bootstrap /bootstrap -RUN chmod -R +x /bootstrap - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, first pass bootstrapping steps. 
-#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - -RUN /bootstrap/01-root-firstpass/001-init.sh - -RUN /bootstrap/01-root-firstpass/002-install-python.sh - -RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh - -RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh - -RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh - -RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh - - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, first pass bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# > BEGINNING marker for airflow user bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - -# Switch to 'airflow' user and update the PATH environment variable. -USER airflow -ENV PATH=${PATH_AIRFLOW_USER} - - -RUN /bootstrap/02-airflow/001-install-airflow.sh - - -# Revert the PATH and user. -ENV PATH=${PATH_DEFAULT} -USER root - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# > END marker for airflow user bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, second pass bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - -# Those steps are also executed as the root user. However, they rely on the -# successfull execution of the airflow user bootstrapping steps. For example, -# giving ownership of the Airflow home user to the 'airflow' user requires the -# the folder to be fully setup first. - - -RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh - -RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh - - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, second pass bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# remove bootstrapping files. -RUN rm -rf /bootstrap - -# Create a volume for syncing files with the sidecar. The actual folder -# is created by the `001-create-mwaa-dir.sh` script. -VOLUME ["${MWAA_HOME}"] - -# TODO We should only expose this port if the command is 'webserver'. -EXPOSE 8080 - -ENV PATH=${PATH_AIRFLOW_USER} -ENV PYTHONPATH="/python" - -WORKDIR ${AIRFLOW_USER_HOME} - -# Copy python files. -COPY ./python /python - - +FROM amazon-mwaa/airflow:2.8.0-base USER airflow diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev index 1cfb1e0..6a03ddb 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev @@ -3,121 +3,27 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. 
# -# This file was generated on 2024-01-30 16:34:46.218794 +# This file was generated on 2024-01-31 02:49:24.649403 # -FROM public.ecr.aws/amazonlinux/amazonlinux:2023 - -# Environment variables -ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 -ENV AIRFLOW_USER_HOME=/usr/local/airflow -ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} -ENV AIRFLOW_VERSION=2.8.0 -ENV MWAA_HOME=/usr/local/mwaa -ENV PYTHON_VERSION=3.11.7 - -# We don't want those variables to stay in the final image, so we use ARG instead of ENV. -ARG PATH_DEFAULT=${PATH} -ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} -ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c -ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms -ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 -ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c -ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb +FROM amazon-mwaa/airflow:2.8.0-base # Copy bootstrapping files. -COPY ./bootstrap /bootstrap -RUN chmod -R +x /bootstrap - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, first pass bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - -RUN /bootstrap/01-root-firstpass/001-init.sh - -RUN /bootstrap/01-root-firstpass/002-install-python.sh - -RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh - -RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh - -RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh - -RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh - - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, first pass bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# > BEGINNING marker for airflow user bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - -# Switch to 'airflow' user and update the PATH environment variable. -USER airflow -ENV PATH=${PATH_AIRFLOW_USER} - - -RUN /bootstrap/02-airflow/001-install-airflow.sh - - -# Revert the PATH and user. -ENV PATH=${PATH_DEFAULT} -USER root - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# > END marker for airflow user bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - +COPY ./bootstrap-dev /bootstrap-dev +RUN chmod -R +x /bootstrap-dev #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, second pass bootstrapping steps. +# BEGINNING marker for dev bootstrapping steps. #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# Those steps are also executed as the root user. However, they rely on the -# successfull execution of the airflow user bootstrapping steps. For example, -# giving ownership of the Airflow home user to the 'airflow' user requires the -# the folder to be fully setup first. 
- - -RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh - -RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh - -RUN /bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh - +RUN /bootstrap-dev/001-devonly-install-dev-tools.sh #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, second pass bootstrapping steps. +# END marker for dev bootstrapping steps. #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # remove bootstrapping files. -RUN rm -rf /bootstrap - -# Create a volume for syncing files with the sidecar. The actual folder -# is created by the `001-create-mwaa-dir.sh` script. -VOLUME ["${MWAA_HOME}"] - -# TODO We should only expose this port if the command is 'webserver'. -EXPOSE 8080 - -ENV PATH=${PATH_AIRFLOW_USER} -ENV PYTHONPATH="/python" - -WORKDIR ${AIRFLOW_USER_HOME} - -# Copy python files. -COPY ./python /python - - +RUN rm -rf /bootstrap-dev USER airflow diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer index a228c8f..a19c24e 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer @@ -3,119 +3,10 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-30 16:34:46.232966 +# This file was generated on 2024-01-31 02:49:24.659949 # -FROM public.ecr.aws/amazonlinux/amazonlinux:2023 - -# Environment variables -ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 -ENV AIRFLOW_USER_HOME=/usr/local/airflow -ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} -ENV AIRFLOW_VERSION=2.8.0 -ENV MWAA_HOME=/usr/local/mwaa -ENV PYTHON_VERSION=3.11.7 - -# We don't want those variables to stay in the final image, so we use ARG instead of ENV. -ARG PATH_DEFAULT=${PATH} -ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} -ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c -ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms -ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 -ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c -ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb - -# Copy bootstrapping files. -COPY ./bootstrap /bootstrap -RUN chmod -R +x /bootstrap - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, first pass bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - -RUN /bootstrap/01-root-firstpass/001-init.sh - -RUN /bootstrap/01-root-firstpass/002-install-python.sh - -RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh - -RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh - -RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh - -RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh - - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, first pass bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# > BEGINNING marker for airflow user bootstrapping steps. 
-#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - -# Switch to 'airflow' user and update the PATH environment variable. -USER airflow -ENV PATH=${PATH_AIRFLOW_USER} - - -RUN /bootstrap/02-airflow/001-install-airflow.sh - - -# Revert the PATH and user. -ENV PATH=${PATH_DEFAULT} -USER root - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# > END marker for airflow user bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, second pass bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - -# Those steps are also executed as the root user. However, they rely on the -# successfull execution of the airflow user bootstrapping steps. For example, -# giving ownership of the Airflow home user to the 'airflow' user requires the -# the folder to be fully setup first. - - -RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh - -RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh - - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, second pass bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# remove bootstrapping files. -RUN rm -rf /bootstrap - -# Create a volume for syncing files with the sidecar. The actual folder -# is created by the `001-create-mwaa-dir.sh` script. -VOLUME ["${MWAA_HOME}"] - -# TODO We should only expose this port if the command is 'webserver'. -EXPOSE 8080 - -ENV PATH=${PATH_AIRFLOW_USER} -ENV PYTHONPATH="/python" - -WORKDIR ${AIRFLOW_USER_HOME} - -# Copy python files. -COPY ./python /python - - +FROM amazon-mwaa/airflow:2.8.0-base USER airflow diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev index 97e17fe..e5475cc 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev @@ -3,121 +3,27 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-30 16:34:46.222403 +# This file was generated on 2024-01-31 02:49:24.652091 # -FROM public.ecr.aws/amazonlinux/amazonlinux:2023 - -# Environment variables -ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 -ENV AIRFLOW_USER_HOME=/usr/local/airflow -ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} -ENV AIRFLOW_VERSION=2.8.0 -ENV MWAA_HOME=/usr/local/mwaa -ENV PYTHON_VERSION=3.11.7 - -# We don't want those variables to stay in the final image, so we use ARG instead of ENV. -ARG PATH_DEFAULT=${PATH} -ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} -ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c -ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms -ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 -ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c -ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb +FROM amazon-mwaa/airflow:2.8.0-base # Copy bootstrapping files. 
-COPY ./bootstrap /bootstrap -RUN chmod -R +x /bootstrap - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, first pass bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - -RUN /bootstrap/01-root-firstpass/001-init.sh - -RUN /bootstrap/01-root-firstpass/002-install-python.sh - -RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh - -RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh - -RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh - -RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh - - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, first pass bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# > BEGINNING marker for airflow user bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - -# Switch to 'airflow' user and update the PATH environment variable. -USER airflow -ENV PATH=${PATH_AIRFLOW_USER} - - -RUN /bootstrap/02-airflow/001-install-airflow.sh - - -# Revert the PATH and user. -ENV PATH=${PATH_DEFAULT} -USER root - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# > END marker for airflow user bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - +COPY ./bootstrap-dev /bootstrap-dev +RUN chmod -R +x /bootstrap-dev #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, second pass bootstrapping steps. +# BEGINNING marker for dev bootstrapping steps. #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# Those steps are also executed as the root user. However, they rely on the -# successfull execution of the airflow user bootstrapping steps. For example, -# giving ownership of the Airflow home user to the 'airflow' user requires the -# the folder to be fully setup first. - - -RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh - -RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh - -RUN /bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh - +RUN /bootstrap-dev/001-devonly-install-dev-tools.sh #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, second pass bootstrapping steps. +# END marker for dev bootstrapping steps. #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # remove bootstrapping files. -RUN rm -rf /bootstrap - -# Create a volume for syncing files with the sidecar. The actual folder -# is created by the `001-create-mwaa-dir.sh` script. -VOLUME ["${MWAA_HOME}"] - -# TODO We should only expose this port if the command is 'webserver'. -EXPOSE 8080 - -ENV PATH=${PATH_AIRFLOW_USER} -ENV PYTHONPATH="/python" - -WORKDIR ${AIRFLOW_USER_HOME} - -# Copy python files. 
-COPY ./python /python - - +RUN rm -rf /bootstrap-dev USER airflow diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged index 84f8a86..16b2123 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged @@ -3,119 +3,10 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-30 16:34:46.236417 +# This file was generated on 2024-01-31 02:49:24.662523 # -FROM public.ecr.aws/amazonlinux/amazonlinux:2023 - -# Environment variables -ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 -ENV AIRFLOW_USER_HOME=/usr/local/airflow -ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} -ENV AIRFLOW_VERSION=2.8.0 -ENV MWAA_HOME=/usr/local/mwaa -ENV PYTHON_VERSION=3.11.7 - -# We don't want those variables to stay in the final image, so we use ARG instead of ENV. -ARG PATH_DEFAULT=${PATH} -ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} -ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c -ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms -ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 -ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c -ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb - -# Copy bootstrapping files. -COPY ./bootstrap /bootstrap -RUN chmod -R +x /bootstrap - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, first pass bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - -RUN /bootstrap/01-root-firstpass/001-init.sh - -RUN /bootstrap/01-root-firstpass/002-install-python.sh - -RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh - -RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh - -RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh - -RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh - - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, first pass bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# > BEGINNING marker for airflow user bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - -# Switch to 'airflow' user and update the PATH environment variable. -USER airflow -ENV PATH=${PATH_AIRFLOW_USER} - - -RUN /bootstrap/02-airflow/001-install-airflow.sh - - -# Revert the PATH and user. -ENV PATH=${PATH_DEFAULT} -USER root - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# > END marker for airflow user bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, second pass bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - -# Those steps are also executed as the root user. 
However, they rely on the -# successfull execution of the airflow user bootstrapping steps. For example, -# giving ownership of the Airflow home user to the 'airflow' user requires the -# the folder to be fully setup first. - - -RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh - -RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh - - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, second pass bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# remove bootstrapping files. -RUN rm -rf /bootstrap - -# Create a volume for syncing files with the sidecar. The actual folder -# is created by the `001-create-mwaa-dir.sh` script. -VOLUME ["${MWAA_HOME}"] - -# TODO We should only expose this port if the command is 'webserver'. -EXPOSE 8080 - -ENV PATH=${PATH_AIRFLOW_USER} -ENV PYTHONPATH="/python" - -WORKDIR ${AIRFLOW_USER_HOME} - -# Copy python files. -COPY ./python /python - - +FROM amazon-mwaa/airflow:2.8.0-base USER root diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev index 8079639..ec23f38 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev @@ -3,121 +3,27 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-30 16:34:46.225951 +# This file was generated on 2024-01-31 02:49:24.654737 # -FROM public.ecr.aws/amazonlinux/amazonlinux:2023 - -# Environment variables -ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 -ENV AIRFLOW_USER_HOME=/usr/local/airflow -ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} -ENV AIRFLOW_VERSION=2.8.0 -ENV MWAA_HOME=/usr/local/mwaa -ENV PYTHON_VERSION=3.11.7 - -# We don't want those variables to stay in the final image, so we use ARG instead of ENV. -ARG PATH_DEFAULT=${PATH} -ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} -ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c -ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms -ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 -ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c -ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm -ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb +FROM amazon-mwaa/airflow:2.8.0-base # Copy bootstrapping files. -COPY ./bootstrap /bootstrap -RUN chmod -R +x /bootstrap - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, first pass bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - -RUN /bootstrap/01-root-firstpass/001-init.sh - -RUN /bootstrap/01-root-firstpass/002-install-python.sh - -RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh - -RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh - -RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh - -RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh - - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, first pass bootstrapping steps. 
-#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# > BEGINNING marker for airflow user bootstrapping steps. -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - -# Switch to 'airflow' user and update the PATH environment variable. -USER airflow -ENV PATH=${PATH_AIRFLOW_USER} - - -RUN /bootstrap/02-airflow/001-install-airflow.sh - - -# Revert the PATH and user. -ENV PATH=${PATH_DEFAULT} -USER root - -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# > END marker for airflow user bootstrapping steps. -#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - +COPY ./bootstrap-dev /bootstrap-dev +RUN chmod -R +x /bootstrap-dev #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# BEGINNING marker for root user, second pass bootstrapping steps. +# BEGINNING marker for dev bootstrapping steps. #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> -# Those steps are also executed as the root user. However, they rely on the -# successfull execution of the airflow user bootstrapping steps. For example, -# giving ownership of the Airflow home user to the 'airflow' user requires the -# the folder to be fully setup first. - - -RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh - -RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh - -RUN /bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh - +RUN /bootstrap-dev/001-devonly-install-dev-tools.sh #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# END marker for root user, second pass bootstrapping steps. +# END marker for dev bootstrapping steps. #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # remove bootstrapping files. -RUN rm -rf /bootstrap - -# Create a volume for syncing files with the sidecar. The actual folder -# is created by the `001-create-mwaa-dir.sh` script. -VOLUME ["${MWAA_HOME}"] - -# TODO We should only expose this port if the command is 'webserver'. -EXPOSE 8080 - -ENV PATH=${PATH_AIRFLOW_USER} -ENV PYTHONPATH="/python" - -WORKDIR ${AIRFLOW_USER_HOME} - -# Copy python files. -COPY ./python /python - - +RUN rm -rf /bootstrap-dev USER root diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile.base b/images/airflow/2.8.0/Dockerfiles/Dockerfile.base new file mode 100644 index 0000000..cc1ca04 --- /dev/null +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile.base @@ -0,0 +1,106 @@ +# +# WARNING: Don't change this file manually. This file is auto-generated from +# the Jinja2-templated Dockerfile.j2 file, so you need to change that file +# instead. +# +# This file was generated on 2024-01-31 02:49:24.646162 +# + +FROM public.ecr.aws/amazonlinux/amazonlinux:2023 + +# Environment variables +ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 +ENV AIRFLOW_USER_HOME=/usr/local/airflow +ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} +ENV AIRFLOW_VERSION=2.8.0 +ENV MWAA_HOME=/usr/local/mwaa +ENV PYTHON_VERSION=3.11.7 + +# We don't want those variables to stay in the final image, so we use ARG instead of ENV. 
+ARG PATH_DEFAULT=${PATH} +ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} +ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c +ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms +ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 +ARG MARIADB_RPM_DEVEL=MariaDB-devel-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_DEVEL_CHECKSUM=cfce6e9b53f4e4fb1cb14f1ed720c92c +ARG MARIADB_RPM_SHARED=MariaDB-shared-11.1.2-1.fc38.x86_64.rpm +ARG MARIADB_RPM_SHARED_CHECKSUM=ed82ad5bc5b35cb2719a9471a71c6cdb + +# Copy bootstrapping files. +COPY ./bootstrap /bootstrap +RUN chmod -R +x /bootstrap + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, first pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +RUN /bootstrap/01-root-firstpass/001-init.sh + +RUN /bootstrap/01-root-firstpass/002-install-python.sh + +RUN /bootstrap/01-root-firstpass/003-install-mariadb.sh + +RUN /bootstrap/01-root-firstpass/004-create-airflow-user.sh + +RUN /bootstrap/01-root-firstpass/005-install-aws-cli.sh + +RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, first pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# > BEGINNING marker for airflow user bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Switch to 'airflow' user and update the PATH environment variable. +USER airflow +ENV PATH=${PATH_AIRFLOW_USER} + +RUN /bootstrap/02-airflow/001-install-airflow.sh + +# Revert the PATH and user. +ENV PATH=${PATH_DEFAULT} +USER root + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# > END marker for airflow user bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +# BEGINNING marker for root user, second pass bootstrapping steps. +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + +# Those steps are also executed as the root user. However, they rely on the +# successfull execution of the airflow user bootstrapping steps. For example, +# giving ownership of the Airflow home user to the 'airflow' user requires the +# the folder to be fully setup first. + +RUN /bootstrap/03-root-secondpass/001-create-mwaa-dir.sh + +RUN /bootstrap/03-root-secondpass/100-chown-airflow-folder.sh + +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# END marker for root user, second pass bootstrapping steps. +#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# remove bootstrapping files. +RUN rm -rf /bootstrap + +# Create a volume for syncing files with the sidecar. The actual folder +# is created by the `001-create-mwaa-dir.sh` script. +VOLUME ["${MWAA_HOME}"] + +# TODO We should only expose this port if the command is 'webserver'. +EXPOSE 8080 + +ENV PATH=${PATH_AIRFLOW_USER} +ENV PYTHONPATH="/python" + +WORKDIR ${AIRFLOW_USER_HOME} + +# Copy python files. 
+COPY ./python /python \ No newline at end of file diff --git a/images/airflow/2.8.0/bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh b/images/airflow/2.8.0/bootstrap-dev/001-devonly-install-dev-tools.sh similarity index 100% rename from images/airflow/2.8.0/bootstrap/03-root-secondpass/200-devonly-install-dev-tools.sh rename to images/airflow/2.8.0/bootstrap-dev/001-devonly-install-dev-tools.sh diff --git a/images/airflow/2.8.0/build.sh b/images/airflow/2.8.0/build.sh index b7eee52..c82df21 100755 --- a/images/airflow/2.8.0/build.sh +++ b/images/airflow/2.8.0/build.sh @@ -1,10 +1,13 @@ #!/bin/bash set -e -python3 generate-dockerfile.py +# Generate the Dockerfiles from the templates. +python3 generate-dockerfiles.py -#!/bin/bash +# Build the base image. +docker build -f ./Dockerfiles/Dockerfile.base -t amazon-mwaa/airflow:2.8.0-base ./ +# Build the derivatives. for dev in "True" "False"; do for build_type in "standard" "explorer" "explorer-privileged"; do dockerfile_name="Dockerfile" diff --git a/images/airflow/2.8.0/explore-image.sh b/images/airflow/2.8.0/explore-image.sh new file mode 100755 index 0000000..6e316d3 --- /dev/null +++ b/images/airflow/2.8.0/explore-image.sh @@ -0,0 +1,2 @@ +#!/bin/bash +docker container run -it amazon-mwaa/airflow:2.8.0-explorer-dev diff --git a/images/airflow/2.8.0/generate-dockerfile.py b/images/airflow/2.8.0/generate-dockerfiles.py similarity index 63% rename from images/airflow/2.8.0/generate-dockerfile.py rename to images/airflow/2.8.0/generate-dockerfiles.py index 3d57e87..ad7c421 100644 --- a/images/airflow/2.8.0/generate-dockerfile.py +++ b/images/airflow/2.8.0/generate-dockerfiles.py @@ -52,32 +52,65 @@ def is_dev_bootstrapping_step(bootstrap_filename: str) -> bool: return len(comps) > 1 and comps[1] == 'devonly' -def generate_dockerfile(output_file, build_type='standard', dev=False) -> None: +def remove_repeated_empty_lines(text: str) -> str: """ - Generate a Dockerfile based on the given build arguments. + Removes repeated empty lines from a given text, leaving at most one empty + line between non-empty lines. - :param build_type: Specifies the build type. This can have the following - values: - - standard: This is the standard build type. it is what customer uses. - - explorer: The 'explorer' build type is almost identical to the - 'standard' build type but it doesn't include the entrypoint. This is - useful for debugging purposes to run the image and look around its - content without starting airflow, which might require further setup. - - explorer-root: This is similar to the 'explorer' build type, but - additionally uses the root user, giving the user of this Docker image - elevated permissions. The user can, thus, install packages, remove - packages, or anything else. + :param text: The input text from which repeated empty lines should be + removed. - :param dev: Whether to produce a development image or a production one. - Development images have extra stuff that are useful during development, - e.g. editors, sudo, etc. + :returns: The cleaned text with no more than one consecutive empty line. 
""" + lines = text.split(os.linesep) # Split the text into lines + previous_line_empty = False # Track if the previous line was empty + cleaned_lines = [] + + for line in lines: + # Check if the current line is empty + if not line.strip(): + if not previous_line_empty: + # If the current line is empty but the previous one wasn't, add + # the empty line + cleaned_lines.append(line) + previous_line_empty = True + else: + # If the current line is not empty, add it and reset the flag + cleaned_lines.append(line) + previous_line_empty = False + + # Join the cleaned lines back into a single string + cleaned_text = os.linesep.join(cleaned_lines) + return cleaned_text + + +def generate_dockerfile(template: str, + output_file: str, + data: dict[str, str]) -> None: # Load Dockerfile Jinja template. file_loader = FileSystemLoader('.') env = Environment(loader=file_loader) env.globals['raise'] = raise_helper - template = env.get_template('Dockerfile.j2') + template = env.get_template(template) + + # Render the template and generate the Dockerfile + output = template.render(data) + with open(os.path.join('./Dockerfiles', output_file), 'w') as f: + f.write(f''' +# +# WARNING: Don't change this file manually. This file is auto-generated from +# the Jinja2-templated Dockerfile.j2 file, so you need to change that file +# instead. +# +# This file was generated on {datetime.now()} +# + '''.strip()) + f.write(os.linesep) + f.write(os.linesep) + f.write(remove_repeated_empty_lines(output)) + +def generate_base_dockerfile() -> None: # Template data data = { 'bootstrapping_scripts_root_firstpass': [ @@ -95,36 +128,61 @@ def generate_dockerfile(output_file, build_type='standard', dev=False) -> None: for name in sorted(os.listdir('./bootstrap/03-root-secondpass')) if not is_dev_bootstrapping_step(name) or dev is True ], - 'build_type': build_type } - # Render the template and generate the Dockerfile - output = template.render(data) - with open(os.path.join('./Dockerfiles', output_file), 'w') as f: - f.write(f''' -# -# WARNING: Don't change this file manually. This file is auto-generated from -# the Jinja2-templated Dockerfile.j2 file, so you need to change that file -# instead. -# -# This file was generated on {datetime.now()} -# - '''.strip()) - f.write(os.linesep) - f.write(os.linesep) - f.write(output) + template_name = 'Dockerfile.base.j2' + dockerfile_name = 'Dockerfile.base' + generate_dockerfile(template_name, dockerfile_name, data) + + +def generate_derivative_dockerfiles(build_type: str = 'standard', + dev: bool = False) -> None: + """ + Generate a Dockerfile based on the given build arguments. + + :param build_type: Specifies the build type. This can have the following + values: + - standard: This is the standard build type. it is what customer uses. + - explorer: The 'explorer' build type is almost identical to the + 'standard' build type but it doesn't include the entrypoint. This is + useful for debugging purposes to run the image and look around its + content without starting airflow, which might require further setup. + - explorer-root: This is similar to the 'explorer' build type, but + additionally uses the root user, giving the user of this Docker image + elevated permissions. The user can, thus, install packages, remove + packages, or anything else. + + :param dev: Whether to produce a development image or a production one. + Development images have extra stuff that are useful during development, + e.g. editors, sudo, etc. 
+ """ + + template_name = 'Dockerfile.derivatives.j2' + dockerfile_name = 'Dockerfile' + if build_type != 'standard': + dockerfile_name = f'{dockerfile_name}-{build_type}' + if dev: + dockerfile_name = f'{dockerfile_name}-dev' + data = { + 'bootstrapping_scripts_dev': [ + os.path.join('/bootstrap-dev', name).strip() + for name in sorted(os.listdir('./bootstrap-dev')) + ] if dev else [], + 'build_type': build_type, + } + + generate_dockerfile(template_name, dockerfile_name, data) if __name__ == '__main__': + # Generate the base Dockerfile file (Dockerfile.base). + generate_base_dockerfile() + + # Generate the derivative Dockerfiles (multiple Dockerfiles based on + # the build arguments.) for dev in [True, False]: for build_type in ['standard', 'explorer', 'explorer-privileged']: - dockerfile_name = 'Dockerfile' - if build_type != 'standard': - dockerfile_name = f'{dockerfile_name}-{build_type}' - if dev: - dockerfile_name = f'{dockerfile_name}-dev' - generate_dockerfile(dockerfile_name, - build_type=build_type, dev=dev) + generate_derivative_dockerfiles(build_type=build_type, dev=dev) else: print('This module cannot be imported.') sys.exit(1) From ca4220b77fbd8138a126699ac71c58d4370dbe50 Mon Sep 17 00:00:00 2001 From: Rafid K Date: Wed, 31 Jan 2024 23:41:08 +0000 Subject: [PATCH 04/11] Implement `safe-pip-install` command More information about this command can be found in #18. --- images/airflow/2.8.0/Dockerfile.base.j2 | 11 ++++++++-- images/airflow/2.8.0/Dockerfiles/Dockerfile | 2 +- .../airflow/2.8.0/Dockerfiles/Dockerfile-dev | 2 +- .../2.8.0/Dockerfiles/Dockerfile-explorer | 2 +- .../2.8.0/Dockerfiles/Dockerfile-explorer-dev | 2 +- .../Dockerfile-explorer-privileged | 2 +- .../Dockerfile-explorer-privileged-dev | 2 +- .../airflow/2.8.0/Dockerfiles/Dockerfile.base | 15 ++++++++++--- .../2.8.0/bin/airflow-user/safe-pip-install | 14 +++++++++++++ .../02-airflow/001-install-airflow.sh | 21 ------------------- .../001-install-required-pip-packages.sh | 9 ++++++++ 11 files changed, 50 insertions(+), 32 deletions(-) create mode 100644 images/airflow/2.8.0/bin/airflow-user/safe-pip-install delete mode 100644 images/airflow/2.8.0/bootstrap/02-airflow/001-install-airflow.sh create mode 100644 images/airflow/2.8.0/bootstrap/02-airflow/001-install-required-pip-packages.sh diff --git a/images/airflow/2.8.0/Dockerfile.base.j2 b/images/airflow/2.8.0/Dockerfile.base.j2 index c8c05b8..5f7f967 100644 --- a/images/airflow/2.8.0/Dockerfile.base.j2 +++ b/images/airflow/2.8.0/Dockerfile.base.j2 @@ -2,6 +2,7 @@ FROM public.ecr.aws/amazonlinux/amazonlinux:2023 # Environment variables ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 +ENV AIRFLOW_CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-2.8.0/constraints-3.11.txt" ENV AIRFLOW_USER_HOME=/usr/local/airflow ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} ENV AIRFLOW_VERSION=2.8.0 @@ -9,8 +10,10 @@ ENV MWAA_HOME=/usr/local/mwaa ENV PYTHON_VERSION=3.11.7 # We don't want those variables to stay in the final image, so we use ARG instead of ENV. 
+ARG AIRFLOW_USER_LOCAL_PATH=${AIRFLOW_USER_HOME}/.local +ARG AIRFLOW_USER_LOCAL_BIN_PATH=${AIRFLOW_USER_LOCAL_PATH}/bin ARG PATH_DEFAULT=${PATH} -ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} +ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_LOCAL_BIN_PATH}:${PATH_DEFAULT} ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm @@ -36,7 +39,11 @@ RUN {{ filepath }} # END marker for root user, first pass bootstrapping steps. #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - +# Copy airflow user's bin/ directory before starting the bootstrapping steps for +# airflow user. +COPY ./bin/airflow-user $AIRFLOW_USER_LOCAL_BIN_PATH +RUN chmod -R +x ${AIRFLOW_USER_LOCAL_BIN_PATH}/* && \ + chown -R airflow: ${AIRFLOW_USER_LOCAL_PATH} #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # > BEGINNING marker for airflow user bootstrapping steps. diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile b/images/airflow/2.8.0/Dockerfiles/Dockerfile index 82d51aa..12112d5 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 02:49:24.657364 +# This file was generated on 2024-01-31 20:02:12.342796 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev index 6a03ddb..4afee30 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 02:49:24.649403 +# This file was generated on 2024-01-31 20:02:12.334771 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer index a19c24e..702f5e2 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 02:49:24.659949 +# This file was generated on 2024-01-31 20:02:12.345378 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev index e5475cc..6f326d8 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 02:49:24.652091 +# This file was generated on 2024-01-31 20:02:12.337537 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged index 16b2123..f8b6b1e 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. 
# -# This file was generated on 2024-01-31 02:49:24.662523 +# This file was generated on 2024-01-31 20:02:12.348042 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev index ec23f38..9b6022e 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 02:49:24.654737 +# This file was generated on 2024-01-31 20:02:12.340210 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile.base b/images/airflow/2.8.0/Dockerfiles/Dockerfile.base index cc1ca04..6a3f3d4 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile.base +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile.base @@ -3,13 +3,14 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 02:49:24.646162 +# This file was generated on 2024-01-31 20:02:12.330369 # FROM public.ecr.aws/amazonlinux/amazonlinux:2023 # Environment variables ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 +ENV AIRFLOW_CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-2.8.0/constraints-3.11.txt" ENV AIRFLOW_USER_HOME=/usr/local/airflow ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} ENV AIRFLOW_VERSION=2.8.0 @@ -17,8 +18,10 @@ ENV MWAA_HOME=/usr/local/mwaa ENV PYTHON_VERSION=3.11.7 # We don't want those variables to stay in the final image, so we use ARG instead of ENV. +ARG AIRFLOW_USER_LOCAL_PATH=${AIRFLOW_USER_HOME}/.local +ARG AIRFLOW_USER_LOCAL_BIN_PATH=${AIRFLOW_USER_LOCAL_PATH}/bin ARG PATH_DEFAULT=${PATH} -ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_HOME}/.local/bin:${PATH_DEFAULT} +ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_LOCAL_BIN_PATH}:${PATH_DEFAULT} ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm @@ -52,6 +55,12 @@ RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh # END marker for root user, first pass bootstrapping steps. #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# Copy airflow user's bin/ directory before starting the bootstrapping steps for +# airflow user. +COPY ./bin/airflow-user $AIRFLOW_USER_LOCAL_BIN_PATH +RUN chmod -R +x ${AIRFLOW_USER_LOCAL_BIN_PATH}/* && \ + chown -R airflow: ${AIRFLOW_USER_LOCAL_PATH} + #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # > BEGINNING marker for airflow user bootstrapping steps. #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @@ -60,7 +69,7 @@ RUN /bootstrap/01-root-firstpass/100-install-needed-dnf-packages.sh USER airflow ENV PATH=${PATH_AIRFLOW_USER} -RUN /bootstrap/02-airflow/001-install-airflow.sh +RUN /bootstrap/02-airflow/001-install-required-pip-packages.sh # Revert the PATH and user. 
ENV PATH=${PATH_DEFAULT} diff --git a/images/airflow/2.8.0/bin/airflow-user/safe-pip-install b/images/airflow/2.8.0/bin/airflow-user/safe-pip-install new file mode 100644 index 0000000..7fb324c --- /dev/null +++ b/images/airflow/2.8.0/bin/airflow-user/safe-pip-install @@ -0,0 +1,14 @@ +#!/bin/bash + +# Define an array of required packages +REQUIRED_PACKAGES=( + "apache-airflow-providers-amazon[aiobotocore]==${AIRFLOW_AMAZON_PROVIDERS_VERSION}" + "apache-airflow[celery,statsd]==${AIRFLOW_VERSION}" + "celery[sqs]" + psycopg2 + pycurl + watchtower +) + +# Install all required packages in one command +pip3 install --constraint "${AIRFLOW_CONSTRAINTS_FILE}" "${REQUIRED_PACKAGES[@]}" "$@" diff --git a/images/airflow/2.8.0/bootstrap/02-airflow/001-install-airflow.sh b/images/airflow/2.8.0/bootstrap/02-airflow/001-install-airflow.sh deleted file mode 100644 index 3509998..0000000 --- a/images/airflow/2.8.0/bootstrap/02-airflow/001-install-airflow.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -set -e - -# shellcheck source=images/airflow/2.8.0/bootstrap/common.sh -source /bootstrap/common.sh - -verify_env_vars_exist \ - AIRFLOW_AMAZON_PROVIDERS_VERSION \ - AIRFLOW_VERSION \ - PYTHON_VERSION - -PYTHON_MAJOR_MINOR_VERSION=${PYTHON_VERSION%.*} - -CONSTRAINT_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_MAJOR_MINOR_VERSION}.txt" -pip3 install --constraint "${CONSTRAINT_FILE}" \ - pycurl \ - psycopg2 \ - "celery[sqs]" \ - "apache-airflow[celery,statsd]==${AIRFLOW_VERSION}" \ - "apache-airflow-providers-amazon[aiobotocore]==${AIRFLOW_AMAZON_PROVIDERS_VERSION}" \ - watchtower diff --git a/images/airflow/2.8.0/bootstrap/02-airflow/001-install-required-pip-packages.sh b/images/airflow/2.8.0/bootstrap/02-airflow/001-install-required-pip-packages.sh new file mode 100644 index 0000000..89ae9cd --- /dev/null +++ b/images/airflow/2.8.0/bootstrap/02-airflow/001-install-required-pip-packages.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# shellcheck source=images/airflow/2.8.0/bootstrap/common.sh +source /bootstrap/common.sh + +# safe-pip-install always installs all required packages, along with whatever +# the user provides, hence we don't need to provide anything here. +safe-pip-install From 3db18f3feff4892f140e687687c2b2b19c8138eb Mon Sep 17 00:00:00 2001 From: Rafid K Date: Thu, 1 Feb 2024 04:51:39 +0000 Subject: [PATCH 05/11] Disallow the direct use of `pip install` To make sure that developers don't accidentally use `pip install` directly, I implemented a script that scans the whole repository for this and reports an error if it finds any such case. While at it, I also introduced the `quality-checks` folder, which contains all scripts for ensuring the quality of the repository. I moved `lint_bash.sh` and `lint_python.sh` into it and put the new script, `pip_install_check.py`, under it as well. This way we have a central place for all such quality check scripts, which are only expected to multiply in number as the repository gets bigger, more contributors are involved, and more quality control is required. The new `quality-checks` folder also contains a script, `run_all.py`, that walks through the `quality-checks` directory and executes any executable script. Accordingly, I also updated the GitHub workflows and pre-commit configuration to use the `run_all.py` script instead of manually listing all quality check scripts.
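For reference, the new check boils down to walking the repository tree and flagging any shell script that still calls `pip install` directly. Below is a condensed, illustrative Python sketch of that logic (the function name here is chosen for illustration); the complete implementation is the `quality-checks/pip_install_check.py` file added in the diff below:

    import os
    import sys

    def find_offending_scripts(directory: str) -> list[str]:
        """Return the shell scripts under `directory` that call `pip install` directly."""
        offenders: list[str] = []
        for root, _dirs, files in os.walk(directory):
            for name in files:
                if not name.endswith(".sh"):
                    continue
                path = os.path.join(root, name)
                with open(path, "r") as script:
                    if any("pip install" in line for line in script):
                        offenders.append(path)
        return offenders

    if __name__ == "__main__":
        offenders = find_offending_scripts("./")
        if offenders:
            print("Use `safe-pip-install` instead of `pip install` in:")
            for path in offenders:
                print(f"- {path}")
            sys.exit(1)

Like the other quality checks, it is meant to be run from the repository root, either directly or via `./quality-checks/run_all.py`.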
--- .github/workflows/bash-lint.yaml | 18 ------ .github/workflows/python-lint.yaml | 23 -------- .github/workflows/quality-checks.yaml | 23 ++++++++ .pre-commit-config.yaml | 12 +--- lint_bash.sh | 10 ---- lint_python.sh | 10 ---- quality-checks/README.md | 29 +++++++++ quality-checks/lint_bash.sh | 19 ++++++ quality-checks/lint_python.sh | 19 ++++++ quality-checks/pip_install_check.py | 85 +++++++++++++++++++++++++++ quality-checks/run_all.py | 76 ++++++++++++++++++++++++ 11 files changed, 254 insertions(+), 70 deletions(-) delete mode 100644 .github/workflows/bash-lint.yaml delete mode 100644 .github/workflows/python-lint.yaml create mode 100644 .github/workflows/quality-checks.yaml delete mode 100755 lint_bash.sh delete mode 100755 lint_python.sh create mode 100644 quality-checks/README.md create mode 100755 quality-checks/lint_bash.sh create mode 100755 quality-checks/lint_python.sh create mode 100755 quality-checks/pip_install_check.py create mode 100755 quality-checks/run_all.py diff --git a/.github/workflows/bash-lint.yaml b/.github/workflows/bash-lint.yaml deleted file mode 100644 index 707f1a9..0000000 --- a/.github/workflows/bash-lint.yaml +++ /dev/null @@ -1,18 +0,0 @@ -name: Lint Bash Scripts - -on: [push, pull_request] - -jobs: - shellcheck: - name: ShellCheck - runs-on: ubuntu-latest - - steps: - - name: Check out code - uses: actions/checkout@v2 - - - name: Install ShellCheck - run: sudo apt install shellcheck - - - name: Run ShellCheck - run: ./lint_bash.sh diff --git a/.github/workflows/python-lint.yaml b/.github/workflows/python-lint.yaml deleted file mode 100644 index b2a12f8..0000000 --- a/.github/workflows/python-lint.yaml +++ /dev/null @@ -1,23 +0,0 @@ -name: Lint Python Scripts - -on: [push, pull_request] - -jobs: - flake8: - name: Flake8 - runs-on: ubuntu-latest - - steps: - - name: Check out code - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: "3.12" - - - name: Install Flake8 - run: pip install flake8 - - - name: Run Flake8 - run: ./lint_python.sh diff --git a/.github/workflows/quality-checks.yaml b/.github/workflows/quality-checks.yaml new file mode 100644 index 0000000..c4cb46a --- /dev/null +++ b/.github/workflows/quality-checks.yaml @@ -0,0 +1,23 @@ +name: Quality Checks Workflow + +on: [push, pull_request] + +jobs: + quality_checks: + name: Quality Checks Job + runs-on: ubuntu-latest + + steps: + - name: Check out code... + uses: actions/checkout@v2 + + - name: Set up Python... + uses: actions/setup-python@v2 + with: + python-version: "3.12" + + - name: Install required Python packages... + run: pip install flake8 + + - name: Run quality checks... + run: ./quality-checks/run_all.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6eb4dce..07fadc3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,14 +7,8 @@ repos: - repo: local hooks: - - id: lint-bash - name: Lint Bash Scripts - entry: ./lint_bash.sh + - id: quality-checks + name: Code Quality Checks + entry: ./quality-checks/run_all.py language: script types: [shell] - - - id: lint-python - name: Lint Python Scripts - entry: ./lint_python.sh - language: script - types: [python] diff --git a/lint_bash.sh b/lint_bash.sh deleted file mode 100755 index 916f217..0000000 --- a/lint_bash.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Lint all Bash files -echo "Running ShellCheck on Bash scripts..." -if ! find . -type f -name "*.sh" -exec shellcheck {} +; then - echo "ShellCheck linting failed." 
- exit 1 -else - echo "ShellCheck linting passed." -fi diff --git a/lint_python.sh b/lint_python.sh deleted file mode 100755 index 4e514d0..0000000 --- a/lint_python.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Lint all Python files -echo "Running Flake8 on Python files..." -if ! flake8 .; then - echo "Flake8 linting failed." - exit 1 -else - echo "Flake8 linting passed." -fi diff --git a/quality-checks/README.md b/quality-checks/README.md new file mode 100644 index 0000000..9d17393 --- /dev/null +++ b/quality-checks/README.md @@ -0,0 +1,29 @@ +# Quality Checks + +## Overview + +This `quality-checks` folder contains a collection of scripts designed to ensure +the quality and integrity of the repository code. These scripts automate various +checks and audits, helping maintain high standards in code development and +repository maintenance. + +## Contents + +- `lint_bash.sh`: A script for linting bash scripts in the repository. It helps + in identifying and fixing potential issues, ensuring that the bash scripts + adhere to best coding practices. + +- `lint_python.sh`: This script is used for linting Python code. It checks + Python scripts for stylistic errors and coding standards, ensuring consistency + and quality in the Python codebase. + +- `pip_install_check.py`: This script searches through bash scripts in a + specified directory to find instances of `pip install`. Its purpose is to + enforce the use of a special command we provide called `safe-pip-install` for + installing pip packages, which meets certain criteria for compatibility. + +## Usage + +The easiest way to run all checks is to run the `run_all.py` script. If you would +like to execute a specific check, you can manually execute the corresponding script, +e.g. `./lint_bash.sh`. diff --git a/quality-checks/lint_bash.sh b/quality-checks/lint_bash.sh new file mode 100755 index 0000000..75b0278 --- /dev/null +++ b/quality-checks/lint_bash.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Ensure the script is being executed while being in the repo root. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(dirname "$SCRIPT_DIR")" +if [[ "$PWD" != "$REPO_ROOT" ]]; then + SCRIPT_NAME=$(basename "$0") + echo "The script must be run from the repo root. Please cd into the repo root directory and type: ./quality-checks/${SCRIPT_NAME}" + exit 1 +fi + +# Lint all Bash files +echo "Running ShellCheck on Bash scripts..." +if ! find . -type f -name "*.sh" -exec shellcheck {} +; then + echo "ShellCheck linting failed." + exit 1 +else + echo "ShellCheck linting passed." +fi diff --git a/quality-checks/lint_python.sh b/quality-checks/lint_python.sh new file mode 100755 index 0000000..0572d5a --- /dev/null +++ b/quality-checks/lint_python.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Ensure the script is being executed while being in the repo root. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(dirname "$SCRIPT_DIR")" +if [[ "$PWD" != "$REPO_ROOT" ]]; then + SCRIPT_NAME=$(basename "$0") + echo "The script must be run from the repo root. Please cd into the repo root directory and type: ./quality-checks/${SCRIPT_NAME}" + exit 1 +fi + +# Lint all Python files +echo "Running Flake8 on Python files..." +if ! flake8 .; then + echo "Flake8 linting failed." + exit 1 +else + echo "Flake8 linting passed." 
+fi diff --git a/quality-checks/pip_install_check.py b/quality-checks/pip_install_check.py new file mode 100755 index 0000000..7f0dfa9 --- /dev/null +++ b/quality-checks/pip_install_check.py @@ -0,0 +1,85 @@ +#!/bin/python3 +import os +import sys + + +EMJOI_CHECK_MARK_BUTTON = '\u2705' +EMJOI_CROSS_MARK = '\u274C' + + +def check_file_for_pip_install(filepath: str) -> bool: + """ + Checks if the file contains 'pip install'. + + :param filepath: The path of the file to check. + + :returns True if the check passes (no 'pip install' found), else False. + """ + with open(filepath, 'r') as file: + for line in file: + if 'pip install' in line: + return False + return True + + +def verify_no_pip_install(directory: str) -> bool: + """ + Recursively searches through the directory tree and verifies that there + are no direct use of `pip install`. + + :param directory: The directory to scan. + + :returns True if the verification succeeds, otherwise False. + """ + # Check if the directory exists + if not os.path.isdir(directory): + print(f"The directory {directory} does not exist.") + return + + # Walk through the directory tree + ret_code = True + for root, dirs, files in os.walk(directory): + for filename in files: + if filename.endswith('.sh'): # Check for bash scripts + filepath = os.path.join(root, filename) + if check_file_for_pip_install(filepath): + print(f"{EMJOI_CHECK_MARK_BUTTON} {filepath}") + else: + print(f"{EMJOI_CROSS_MARK} {filepath}.") + ret_code = False + + return ret_code + + +def verify_in_repo_root() -> None: + """ + Verifies that the script is being executed from within the repository + root. Exits with non-zero code if that's not the case. + """ + # Determine the script's directory and the parent directory (which should + # be ) + script_dir = os.path.dirname(os.path.realpath(__file__)) + repo_root = os.path.abspath(os.path.join(script_dir, '..')) + + # Check if the current working directory is the repo root + if os.getcwd() != repo_root: + print("The script must be run from the repo root. Please cd into " + + "the repo root directory and then type: " + + f"./quality-checks/{os.path.basename(__file__)}.") + sys.exit(1) + + +def main() -> None: + verify_in_repo_root() + + if verify_no_pip_install('./'): + sys.exit(0) + else: + print("Some files failed the check. Please ensure you are using " + + "`safe-pip-install` in those files instead of directly " + + "calling `pip install`.") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/quality-checks/run_all.py b/quality-checks/run_all.py new file mode 100755 index 0000000..b1bb58d --- /dev/null +++ b/quality-checks/run_all.py @@ -0,0 +1,76 @@ +#!/bin/python3 +import os +import subprocess +import sys + + +def prefix_output(file, process): + """ + Prefix each line of output with the filename. + """ + for line in process.stdout: + print(f"[{file}] {line.decode().strip()}") + + +def verify_in_repo_root() -> None: + """ + Verifies that the script is being executed from within the repository + root. Exits with non-zero code if that's not the case. + """ + # Determine the script's directory and the parent directory (which should + # be ) + script_dir = os.path.dirname(os.path.realpath(__file__)) + repo_root = os.path.abspath(os.path.join(script_dir, '..')) + + # Check if the current working directory is the repo root + if os.getcwd() != repo_root: + print("The script must be run from the repo root. 
Please cd into " + + "the repo root directory and then type: " + + f"./quality-checks/{os.path.basename(__file__)}.") + sys.exit(1) + + +def main() -> None: + """ + Script entrypoint. + """ + verify_in_repo_root() + + quality_checks_dir = './quality-checks/' + failed_scripts = [] + + # Iterate over every file in the quality-checks directory + for file in os.listdir(quality_checks_dir): + filepath = os.path.join(quality_checks_dir, file) + + # Exclude README.md and run_all.sh + if file in ["README.md", "run_all.py"]: + continue + + # Check if the file is executable + if os.access(filepath, os.X_OK): + print(f"Executing: {file}") + with subprocess.Popen(filepath, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) as process: + prefix_output(file, process) + process.wait() # Wait for the process to complete + + # Check the exit status of the script + if process.returncode != 0: + print(f"Script {file} failed with exit status " + + f"{process.returncode}.") + failed_scripts.append(file) + print() + + # Exit with a non-zero status if any script failed + if failed_scripts: + print('The following scripts failed:') + for fs in failed_scripts: + print(f'- {fs}') + sys.exit(1) + + +if __name__ == "__main__": + main() From 1eb5f834a487e1ef5602085563c618a915bd9f81 Mon Sep 17 00:00:00 2001 From: Rafid K Date: Sun, 4 Feb 2024 18:55:11 +0000 Subject: [PATCH 06/11] Improve development experience - Now using VSCode workspace. Not only does this improve the repo navigation, but also allow using multiple Python interpreters, which is required since we use different Python requirements for the repo code vs the Docker images code. - Use Pyright for type checking. - Use ruff for Python linting. --- .code-workspace | 13 ++ .github/workflows/quality-checks.yaml | 39 ++++- .gitignore | 2 + .pre-commit-config.yaml | 6 - .vscode/settings.json | 21 +++ create_venvs.py | 68 ++++++++ images/airflow/2.8.0/.vscode/settings.json | 21 +++ .../2.8.0/bin/airflow-user/safe-pip-install | 1 + images/airflow/2.8.0/dags/hello_world.py | 7 +- images/airflow/2.8.0/pyrightconfig.json | 8 + .../2.8.0/python/mwaa/config/airflow.py | 17 +- .../airflow/2.8.0/python/mwaa/config/aws.py | 5 +- .../2.8.0/python/mwaa/config/celery.py | 15 +- .../2.8.0/python/mwaa/config/database.py | 13 +- .../airflow/2.8.0/python/mwaa/config/sqs.py | 38 ++--- .../airflow/2.8.0/python/mwaa/entrypoint.py | 160 ++++++++++-------- images/airflow/2.8.0/requirements.txt | 6 + .../{2.8.0 => }/generate-dockerfiles.py | 143 ++++++++-------- pyrightconfig.json | 14 ++ quality-checks/lint_python.sh | 41 ++++- quality-checks/pip_install_check.py | 34 ++-- quality-checks/run_all.py | 45 +++-- requirements.txt | 12 ++ 23 files changed, 492 insertions(+), 237 deletions(-) create mode 100644 .code-workspace create mode 100644 .gitignore create mode 100644 .vscode/settings.json create mode 100755 create_venvs.py create mode 100644 images/airflow/2.8.0/.vscode/settings.json create mode 100644 images/airflow/2.8.0/pyrightconfig.json create mode 100644 images/airflow/2.8.0/requirements.txt rename images/airflow/{2.8.0 => }/generate-dockerfiles.py (55%) create mode 100644 pyrightconfig.json create mode 100644 requirements.txt diff --git a/.code-workspace b/.code-workspace new file mode 100644 index 0000000..22c6e2c --- /dev/null +++ b/.code-workspace @@ -0,0 +1,13 @@ +{ + "folders": [ + { + "name": "amazon-mwaa-docker-images", + "path": "./" + }, + { + "name": "airflow-2.8.0", + "path": "./images/airflow/2.8.0" + } + ], + "settings": {} +} diff --git 
a/.github/workflows/quality-checks.yaml b/.github/workflows/quality-checks.yaml index c4cb46a..35b721c 100644 --- a/.github/workflows/quality-checks.yaml +++ b/.github/workflows/quality-checks.yaml @@ -7,17 +7,40 @@ jobs: name: Quality Checks Job runs-on: ubuntu-latest + container: + image: public.ecr.aws/amazonlinux/amazonlinux:2023 + steps: + - name: Install required packages... + run: | + # Update packages and install required dependencies: + # gcc, libcurl-devel: For compiling pycurl (required for our Airflow setup.) + # gzip: Requiring by actions/checkout@v2 to gunzip the source code. + # postgresql-devel: Required for our Airflow setup. + # python3-devel: Required for building some Python modules, e.g. pycurl. + # python3: Self explanatory. + # tar, wget, xz: For downloading and extracting ShellCheck + dnf update -y + dnf install -y \ + gcc \ + gzip \ + libcurl-devel \ + postgresql-devel \ + python3 \ + python3-devel \ + tar \ + wget \ + xz + # Download and install shellcheck for linting shell scripts + wget https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz + tar -xvf shellcheck-stable.linux.x86_64.tar.xz + cp shellcheck-stable/shellcheck /usr/bin/ + - name: Check out code... uses: actions/checkout@v2 - - name: Set up Python... - uses: actions/setup-python@v2 - with: - python-version: "3.12" - - - name: Install required Python packages... - run: pip install flake8 + - name: Create the necessary Python virtual environments... + run: python3 ./create_venvs.py - name: Run quality checks... - run: ./quality-checks/run_all.py + run: python3 ./quality-checks/run_all.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a415184 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv +.venv diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 07fadc3..d767bd9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,4 @@ repos: - - repo: https://github.com/hhatto/autopep8 - rev: v2.0.4 - hooks: - - id: autopep8 - args: ["--in-place", "--aggressive", "--aggressive"] - - repo: local hooks: - id: quality-checks diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..6385e6c --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "files.exclude": { + "**/.DS_Store": true, + "**/.conda": true, + "**/.git": true, + "**/.ruff_cache": true, + "**/.venv": true, + "**/Thumbs.db": true, + "**/venv": true + }, + "search.exclude": { + "**/.DS_Store": true, + "**/.conda": true, + "**/.git": true, + "**/.ruff_cache": true, + "**/.venv": true, + "**/Thumbs.db": true, + "**/venv": true + }, + "python.defaultInterpreterPath": "./venv/bin/python" +} diff --git a/create_venvs.py b/create_venvs.py new file mode 100755 index 0000000..1adf9ed --- /dev/null +++ b/create_venvs.py @@ -0,0 +1,68 @@ +import os +import subprocess +import sys +import venv +from pathlib import Path + + +def verify_python_version(): + """Check if the current Python version is at least 3.9.""" + _major, minor, *_ = sys.version_info + if minor < 9: + print("Python 3.9 or higher is required.") + sys.exit(1) + + +def create_venv(path: Path): + """Create a virtual environment in the given directory and install + requirements if `requirements.txt` is present. 
+ + :param dir_path: The path to create the venv in.""" + venv_path = path / ".venv" + + if not venv_path.exists(): + print(f"Creating virtualenv in directory: {path}") + venv.create(venv_path, with_pip=True) + else: + print(f"Virtualenv already exists in {venv_path}") + + requirements_path = path / "requirements.txt" + pip_install(venv_path, requirements_path) + + +def pip_install(venv_dir: Path, requirements_file: Path): + """Install dependencies from requirements.txt if it exists. + + :param venv_dir: The path to the venv directory. + :param venv_dir: The path to the requirements.txt file.""" + if os.path.exists(requirements_file): + print(f"Installing dependencies from {requirements_file}...") + subprocess.run( + [ + os.path.join(venv_dir, "bin", "python"), + "-m", + "pip", + "install", + "-U", + "-r", + str(requirements_file), + "pip", # Upgrade pip as well. + ], + check=True, + ) + + +def main(): + """Main entrypoint of the script.""" + verify_python_version() + project_dirs = [ + Path("."), + *Path("./images").glob("airflow/*"), + ] # Include main project dir and each image dir + for dir_path in project_dirs: + if dir_path.is_dir() and (dir_path / "requirements.txt").exists(): + create_venv(dir_path) + + +if __name__ == "__main__": + main() diff --git a/images/airflow/2.8.0/.vscode/settings.json b/images/airflow/2.8.0/.vscode/settings.json new file mode 100644 index 0000000..6385e6c --- /dev/null +++ b/images/airflow/2.8.0/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "files.exclude": { + "**/.DS_Store": true, + "**/.conda": true, + "**/.git": true, + "**/.ruff_cache": true, + "**/.venv": true, + "**/Thumbs.db": true, + "**/venv": true + }, + "search.exclude": { + "**/.DS_Store": true, + "**/.conda": true, + "**/.git": true, + "**/.ruff_cache": true, + "**/.venv": true, + "**/Thumbs.db": true, + "**/venv": true + }, + "python.defaultInterpreterPath": "./venv/bin/python" +} diff --git a/images/airflow/2.8.0/bin/airflow-user/safe-pip-install b/images/airflow/2.8.0/bin/airflow-user/safe-pip-install index 7fb324c..c088913 100644 --- a/images/airflow/2.8.0/bin/airflow-user/safe-pip-install +++ b/images/airflow/2.8.0/bin/airflow-user/safe-pip-install @@ -1,6 +1,7 @@ #!/bin/bash # Define an array of required packages +# TODO Remove this and use requirements.txt. 
REQUIRED_PACKAGES=( "apache-airflow-providers-amazon[aiobotocore]==${AIRFLOW_AMAZON_PROVIDERS_VERSION}" "apache-airflow[celery,statsd]==${AIRFLOW_VERSION}" diff --git a/images/airflow/2.8.0/dags/hello_world.py b/images/airflow/2.8.0/dags/hello_world.py index cbd8a51..fd74fa5 100644 --- a/images/airflow/2.8.0/dags/hello_world.py +++ b/images/airflow/2.8.0/dags/hello_world.py @@ -1,3 +1,5 @@ +"""A sample DAG.""" + # Python imports from datetime import datetime, timedelta @@ -6,7 +8,7 @@ from airflow.decorators import task with DAG( - dag_id='hello_world_dag', + dag_id="hello_world_dag", schedule_interval=timedelta(minutes=1), dagrun_timeout=timedelta(minutes=5), start_date=datetime(2024, 1, 1), @@ -15,7 +17,8 @@ ) as dag: @task(task_id="print_task") - def hello_world(): + def hello_world() -> None: + """print_task prints a Hello World message.""" print("Hello, World!") hello_world() diff --git a/images/airflow/2.8.0/pyrightconfig.json b/images/airflow/2.8.0/pyrightconfig.json new file mode 100644 index 0000000..2fcc260 --- /dev/null +++ b/images/airflow/2.8.0/pyrightconfig.json @@ -0,0 +1,8 @@ +{ + "include": ["./"], + "exclude": ["**/__pycache__", "**/.venv"], + "strict": ["./"], + "pythonVersion": "3.11", + "pythonPlatform": "All", + "typeCheckingMode": "strict" +} diff --git a/images/airflow/2.8.0/python/mwaa/config/airflow.py b/images/airflow/2.8.0/python/mwaa/config/airflow.py index 5a76468..934eb10 100644 --- a/images/airflow/2.8.0/python/mwaa/config/airflow.py +++ b/images/airflow/2.8.0/python/mwaa/config/airflow.py @@ -4,7 +4,7 @@ from mwaa.config.sqs import get_sqs_endpoint, get_sqs_queue_name -def get_airflow_db_config() -> Dict: +def get_airflow_db_config() -> Dict[str, str]: """ Retrieves the environment variables required to set the necessary Airflow configurations under the "database" section. @@ -15,18 +15,19 @@ def get_airflow_db_config() -> Dict: } -def get_airflow_celery_config() -> Dict: +def get_airflow_celery_config() -> Dict[str, str]: """ Retrieves the environment variables required to set the necessary Airflow configurations for using Celery (mostly under the "celery" section, but other sections as well.) """ + + celery_config_module_path = "mwaa.config.celery.MWAA_CELERY_CONFIG" + return { - "AIRFLOW__CELERY_BROKER_TRANSPORT_OPTIONS__VISIBILITY_TIMEOUT": - "43200", + "AIRFLOW__CELERY_BROKER_TRANSPORT_OPTIONS__VISIBILITY_TIMEOUT": "43200", "AIRFLOW__CELERY__BROKER_URL": get_sqs_endpoint(), - "AIRFLOW__CELERY__CELERY_CONFIG_OPTIONS": - "mwaa.config.celery.MWAA_CELERY_CONFIG", + "AIRFLOW__CELERY__CELERY_CONFIG_OPTIONS": celery_config_module_path, "AIRFLOW__CELERY__RESULT_BACKEND": f"db+{get_db_connection_string()}", "AIRFLOW__CELERY__WORKER_ENABLE_REMOTE_CONTROL": "False", "AIRFLOW__CORE__EXECUTOR": "CeleryExecutor", @@ -35,7 +36,7 @@ def get_airflow_celery_config() -> Dict: } -def get_airflow_core_config() -> Dict: +def get_airflow_core_config() -> Dict[str, str]: """ Retrieves the environment variables required to set the necessary Airflow configurations under the "core" section. @@ -45,7 +46,7 @@ def get_airflow_core_config() -> Dict: } -def get_airflow_config() -> Dict: +def get_airflow_config() -> Dict[str, str]: """ Retrieves the environment variables required to set the necessary Airflow configurations. 
diff --git a/images/airflow/2.8.0/python/mwaa/config/aws.py b/images/airflow/2.8.0/python/mwaa/config/aws.py index 46586cf..32b5c94 100644 --- a/images/airflow/2.8.0/python/mwaa/config/aws.py +++ b/images/airflow/2.8.0/python/mwaa/config/aws.py @@ -1,14 +1,13 @@ import os -def get_aws_region(): +def get_aws_region() -> str: """ Retrieves the AWS region the container should communicate with. This is assumed to be available in either the AWS_REGION or AWS_DEFAULT_REGION environment variables, checked respectively. """ - region = os.environ.get('AWS_REGION') or \ - os.environ.get('AWS_DEFAULT_REGION') + region = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION") if region: return region else: diff --git a/images/airflow/2.8.0/python/mwaa/config/celery.py b/images/airflow/2.8.0/python/mwaa/config/celery.py index 7506387..588c658 100644 --- a/images/airflow/2.8.0/python/mwaa/config/celery.py +++ b/images/airflow/2.8.0/python/mwaa/config/celery.py @@ -1,33 +1,28 @@ # Python imports import copy +from typing import Any # 3rd party imports -from airflow.providers.celery.executors.default_celery import ( - DEFAULT_CELERY_CONFIG -) +from airflow.providers.celery.executors.default_celery import DEFAULT_CELERY_CONFIG # Our import from mwaa.config.aws import get_aws_region from mwaa.config.sqs import get_sqs_queue_name, get_sqs_queue_url -def create_celery_config(): +def create_celery_config() -> dict[str, Any]: """ Generate the configuration that will be passed to Celery. This is used in the "celery" section of the Airflow configuration. """ # We use Airflow's default condfiguration and make the changes we want. - celery_config = copy.deepcopy(DEFAULT_CELERY_CONFIG) + celery_config: dict[str, Any] = copy.deepcopy(DEFAULT_CELERY_CONFIG) celery_config = { **celery_config, "broker_transport_options": { **celery_config["broker_transport_options"], - "predefined_queues": { - get_sqs_queue_name(): { - "url": get_sqs_queue_url() - } - }, + "predefined_queues": {get_sqs_queue_name(): {"url": get_sqs_queue_url()}}, "is_secure": True, "region": get_aws_region(), }, diff --git a/images/airflow/2.8.0/python/mwaa/config/database.py b/images/airflow/2.8.0/python/mwaa/config/database.py index 5e08f6c..f149748 100644 --- a/images/airflow/2.8.0/python/mwaa/config/database.py +++ b/images/airflow/2.8.0/python/mwaa/config/database.py @@ -25,14 +25,15 @@ def get_db_connection_string() -> str: ) = itemgetter(*env_vars_names)(os.environ) except Exception as e: raise RuntimeError( - 'One or more of the required environment variables for ' + - 'configuring Postgres are not set. Please ensure you set all ' + - 'all the following environment variables: ' + - f'{", ".join(env_vars_names)}. This was the result of the ' + - f'following exception: {e}') + "One or more of the required environment variables for " + "configuring Postgres are not set. Please ensure you set all " + "all the following environment variables: " + f'{", ".join(env_vars_names)}. This was the result of the ' + f"following exception: {e}" + ) protocol = "postgresql+psycopg2" creds = f"{postgres_user}:{postgres_password}" addr = f"{postgres_host}:{postgres_port}" # TODO We need to do what is the necessary to enforce 'require'. 
- return f'{protocol}://{creds}@{addr}/{postgres_db}?sslmode=prefer' + return f"{protocol}://{creds}@{addr}/{postgres_db}?sslmode=prefer" diff --git a/images/airflow/2.8.0/python/mwaa/config/sqs.py b/images/airflow/2.8.0/python/mwaa/config/sqs.py index b04a0b6..2d04223 100644 --- a/images/airflow/2.8.0/python/mwaa/config/sqs.py +++ b/images/airflow/2.8.0/python/mwaa/config/sqs.py @@ -9,7 +9,7 @@ from mwaa.config.aws import get_aws_region -def _change_protocol_to_sqs(url) -> str: +def _change_protocol_to_sqs(url: str) -> str: """ Make the given SQS endpoint Celery friendly by setting the URL protocol to sqs://. @@ -22,25 +22,23 @@ def _change_protocol_to_sqs(url) -> str: parsed_url = urlparse(url) # Check if the scheme was missing and was defaulted to 'http' - if parsed_url.netloc == '': + if parsed_url.netloc == "": # Scheme is missing, netloc is actually part of the path. # See the documentation for urlparse() if you don't understand the # reasoning. new_netloc = parsed_url.path - new_path = '' + new_path = "" else: # Scheme is present. new_netloc = parsed_url.netloc new_path = parsed_url.path - return urlunparse(parsed_url._replace( - scheme='sqs', - netloc=new_netloc, - path=new_path - )) + return urlunparse( + parsed_url._replace(scheme="sqs", netloc=new_netloc, path=new_path) + ) -def _get_sqs_default_endpoint(): +def _get_sqs_default_endpoint() -> str: """ Retrieves the default SQS endpoint for the current AWS region. """ @@ -49,7 +47,7 @@ def _get_sqs_default_endpoint(): session = boto3.Session(region_name=get_aws_region()) # Create an SQS client from this session - sqs = session.client('sqs') + sqs = session.client("sqs") # Return the endpoint URL return sqs.meta.endpoint_url @@ -63,12 +61,11 @@ def get_sqs_endpoint() -> str: used. """ return _change_protocol_to_sqs( - os.environ.get('MWAA__SQS__CUSTOM_ENDPOINT') - or _get_sqs_default_endpoint() + os.environ.get("MWAA__SQS__CUSTOM_ENDPOINT") or _get_sqs_default_endpoint() ) -def _get_queue_name_from_url(queue_url) -> str: +def _get_queue_name_from_url(queue_url: str) -> str: """ Extracts the queue name from an Amazon SQS queue URL. @@ -78,16 +75,15 @@ def _get_queue_name_from_url(queue_url) -> str: """ try: # Validate the protocol. - if not queue_url.startswith("http://") and \ - not queue_url.startswith("https://"): + if not queue_url.startswith("http://") and not queue_url.startswith("https://"): raise ValueError( - f"URL {queue_url} is should start with http:// or https://") + f"URL {queue_url} is should start with http:// or https://" + ) - parts = queue_url.split('/') + parts = queue_url.split("/") if len(parts) < 2: - raise ValueError( - f"URL {queue_url} is invalid.") + raise ValueError(f"URL {queue_url} is invalid.") return parts[-1] except Exception as e: @@ -98,10 +94,10 @@ def get_sqs_queue_url() -> str: """ Retrieves the URL of the SQS queue specified for use with Celery. 
""" - env_var_name = 'MWAA__SQS__QUEUE_URL' + env_var_name = "MWAA__SQS__QUEUE_URL" if env_var_name not in os.environ: raise RuntimeError( - "The name of the SQS queue to use should be specified in an " + + "The name of the SQS queue to use should be specified in an " f"environment variable called '{env_var_name}.'" ) return os.environ.get(env_var_name) # type: ignore diff --git a/images/airflow/2.8.0/python/mwaa/entrypoint.py b/images/airflow/2.8.0/python/mwaa/entrypoint.py index ac0e7e3..4ad3654 100644 --- a/images/airflow/2.8.0/python/mwaa/entrypoint.py +++ b/images/airflow/2.8.0/python/mwaa/entrypoint.py @@ -11,6 +11,7 @@ import sys import time import subprocess +from typing import Any, Callable, TypeVar, cast # 3rd party imports @@ -18,94 +19,117 @@ from sqlalchemy.engine import Engine # Our imports -from mwaa.config.airflow import get_airflow_config, get_db_connection_string +from mwaa.config.airflow import get_airflow_config +from mwaa.config.database import get_db_connection_string -def abort(err_msg: str, exit_code=1) -> None: +def abort(err_msg: str, exit_code: int = 1) -> None: print(err_msg) sys.exit(exit_code) AVAILABLE_COMMANDS = [ - 'webserver', - 'scheduler', - 'worker', - 'triggerer', - 'shell', - 'spy', + "webserver", + "scheduler", + "worker", + "triggerer", + "shell", + "spy", ] -def verify_versions(): +def verify_versions() -> None: major, minor, micro, *_ = sys.version_info - assert os.environ['PYTHON_VERSION'] == f'{major}.{minor}.{micro}' + assert os.environ["PYTHON_VERSION"] == f"{major}.{minor}.{micro}" -def db_lock(lock_id, timeout=300 * 1000): - def db_lock_specific(func): - def wrapped(*args, **kwargs): - func_name = func.__name__ +F = TypeVar("F", bound=Callable[..., Any]) + + +def db_lock(lock_id: int, timeout: int = 300 * 1000) -> Callable[[F], F]: + def decorator(func: F) -> F: + def wrapper(*args: Any, **kwargs: Any) -> Any: + func_name: str = func.__name__ db_engine: Engine = create_engine( - get_db_connection_string()) # type: ignore - print(f'Obtaining lock for {func_name}...') + get_db_connection_string() # Assuming this is defined elsewhere + ) + print(f"Obtaining lock for {func_name}...") with db_engine.connect() as conn: try: - conn.execute(text("SET LOCK_TIMEOUT to :timeout"), - {"timeout": timeout}) - conn.execute(text("SELECT pg_advisory_lock(:id)"), - {"id": lock_id}) - print(f'Obtained lock for {func_name}.') + conn.execute( + text("SET LOCK_TIMEOUT to :timeout"), {"timeout": timeout} + ) + conn.execute(text("SELECT pg_advisory_lock(:id)"), {"id": lock_id}) + print(f"Obtained lock for {func_name}.") try: func(*args, **kwargs) except Exception as e: abort( - f'Failed while executing {func_name}. ' + - f'Error: {e}.' - ) + f"Failed while executing {func_name}. " + f"Error: {e}." + ) # Assuming abort is defined elsewhere except Exception as e: abort( - f'Failed to obtain DB lock for {func_name}. ' + - f'Error: {e}.' + f"Failed to obtain DB lock for {func_name}. " + f"Error: {e}." 
) finally: - print(f'Releasing lock for {func_name}...') - conn.execute("SET LOCK_TIMEOUT TO DEFAULT") - conn.execute(text("SELECT pg_advisory_unlock(:id)"), { - "id": lock_id}) - print(f'Released lock for {func_name}') - return wrapped - return db_lock_specific + print(f"Releasing lock for {func_name}...") + conn.execute(text("SET LOCK_TIMEOUT TO DEFAULT")) + conn.execute( + text("SELECT pg_advisory_unlock(:id)"), {"id": lock_id} + ) + print(f"Released lock for {func_name}") + + return cast(F, wrapper) + + return decorator @db_lock(1234) -def airflow_db_init(environ): +def airflow_db_init(environ: dict[str, str]) -> None: print("Calling 'airflow db migrate' to initialize the database.") - response = subprocess.run(["airflow db migrate"], - shell=True, check=True, text=True, env=environ) + response = subprocess.run( + ["airflow db migrate"], shell=True, check=True, text=True, env=environ + ) if response.returncode: - raise RuntimeError(f'Failed to migrate db. Error: {response.stderr}') + raise RuntimeError(f"Failed to migrate db. Error: {response.stderr}") @db_lock(5678) -def create_www_user(environ): +def create_www_user(environ: dict[str, str]) -> None: print("Calling 'airflow users create' to create the webserver user.") - response = subprocess.run(' '.join([ - "airflow", "users", "create", - "--username", "airflow", - "--firstname", "Airflow", - "--lastname", "Admin", - "--email", "airflow@example.com", - "--role", "Admin", - "--password", "airflow", - ]), shell=True, check=True, text=True, env=environ) + response = subprocess.run( + " ".join( + [ + "airflow", + "users", + "create", + "--username", + "airflow", + "--firstname", + "Airflow", + "--lastname", + "Admin", + "--email", + "airflow@example.com", + "--role", + "Admin", + "--password", + "airflow", + ] + ), + shell=True, + check=True, + text=True, + env=environ, + ) if response.returncode: - raise RuntimeError(f'Failed to create user. Error: {response.stderr}') + raise RuntimeError(f"Failed to create user. Error: {response.stderr}") -def export_env_variables(environ): +def export_env_variables(environ: dict[str, str]) -> None: # Get the home directory of the current user home_dir = os.path.expanduser("~") bashrc_path = os.path.join(home_dir, ".bashrc") @@ -131,22 +155,26 @@ def main() -> None: """Entrypoint of the script.""" try: - (_, command, ) = sys.argv + ( + _, + command, + ) = sys.argv if command not in AVAILABLE_COMMANDS: - exit(f'Invalid command: {command}. ' + - f'Use one of {", ".join(AVAILABLE_COMMANDS)}.') + exit( + f"Invalid command: {command}. " + f'Use one of {", ".join(AVAILABLE_COMMANDS)}.' + ) except Exception as e: print(sys.argv) - exit('Invalid arguments. Please provide one argument with one of' + - f'the values: {", ".join(AVAILABLE_COMMANDS)}. Error was {e}.') + exit( + "Invalid arguments. Please provide one argument with one of" + f'the values: {", ".join(AVAILABLE_COMMANDS)}. Error was {e}.' + ) print(f"Warming a Docker container for an Airflow {command}.") # Add the necessary environment variables. 
- environ = { - **os.environ, - **get_airflow_config() - } + environ = {**os.environ, **get_airflow_config()} airflow_db_init(environ) create_www_user(environ) @@ -157,19 +185,19 @@ def main() -> None: export_env_variables(environ) match command: - case 'shell': - os.execlpe('/bin/bash', '/bin/bash', environ) - case 'spy': + case "shell": + os.execlpe("/bin/bash", "/bin/bash", environ) + case "spy": while True: time.sleep(1) - case 'worker': - os.execlpe('airflow', 'airflow', 'celery', 'worker', environ) + case "worker": + os.execlpe("airflow", "airflow", "celery", "worker", environ) case _: - os.execlpe('airflow', 'airflow', command, environ) + os.execlpe("airflow", "airflow", command, environ) -if __name__ == '__main__': +if __name__ == "__main__": main() else: - print('This module cannot be imported.') + print("This module cannot be imported.") sys.exit(1) diff --git a/images/airflow/2.8.0/requirements.txt b/images/airflow/2.8.0/requirements.txt new file mode 100644 index 0000000..db0fbeb --- /dev/null +++ b/images/airflow/2.8.0/requirements.txt @@ -0,0 +1,6 @@ +apache-airflow-providers-amazon[aiobotocore]==8.13.0 +apache-airflow[celery,statsd]==2.8.0 +celery[sqs] +psycopg2 +pycurl +watchtower \ No newline at end of file diff --git a/images/airflow/2.8.0/generate-dockerfiles.py b/images/airflow/generate-dockerfiles.py similarity index 55% rename from images/airflow/2.8.0/generate-dockerfiles.py rename to images/airflow/generate-dockerfiles.py index ad7c421..e1bdac5 100644 --- a/images/airflow/2.8.0/generate-dockerfiles.py +++ b/images/airflow/generate-dockerfiles.py @@ -16,42 +16,29 @@ import os import sys from datetime import datetime +from typing import Any, List +from pathlib import Path try: from jinja2 import Environment, FileSystemLoader except ImportError: - print(''' + print( + """ jinja2 pip library is required. Please install it with: pip3 install jinja2 -'''.strip()) +""".strip() + ) sys.exit(1) -def raise_helper(msg) -> None: +def raise_helper(msg: str) -> None: """ Helper method to enable Jinja2 templates to raise an exception. """ raise RuntimeError(msg) -def is_dev_bootstrapping_step(bootstrap_filename: str) -> bool: - """ - Determines whether the given bootstrap filename is supposed to run only in - development images. This is decided based on the prefix "devonly-" in the - filename directly following the index prefix (the numbers at the - beginning.) For example, the file `200-devonly-install-dev-tools.sh` - matches this criteria and will be only executed for building development - images. - - :param bootstrap_filename: The name of the bootstrapping file. - - :return True or False. - """ - comps = bootstrap_filename.split('-') - return len(comps) > 1 and comps[1] == 'devonly' - - def remove_repeated_empty_lines(text: str) -> str: """ Removes repeated empty lines from a given text, leaving at most one empty @@ -64,7 +51,7 @@ def remove_repeated_empty_lines(text: str) -> str: """ lines = text.split(os.linesep) # Split the text into lines previous_line_empty = False # Track if the previous line was empty - cleaned_lines = [] + cleaned_lines: List[str] = [] for line in lines: # Check if the current line is empty @@ -84,19 +71,20 @@ def remove_repeated_empty_lines(text: str) -> str: return cleaned_text -def generate_dockerfile(template: str, - output_file: str, - data: dict[str, str]) -> None: +def generate_dockerfile( + image_root_dir: Path, template_filename: str, output_file: str, data: dict[str, Any] +) -> None: # Load Dockerfile Jinja template. 
- file_loader = FileSystemLoader('.') - env = Environment(loader=file_loader) - env.globals['raise'] = raise_helper - template = env.get_template(template) + file_loader = FileSystemLoader(image_root_dir) + env = Environment(loader=file_loader, autoescape=True) + env.globals["raise"] = raise_helper # type: ignore + template = env.get_template(template_filename) # Render the template and generate the Dockerfile output = template.render(data) - with open(os.path.join('./Dockerfiles', output_file), 'w') as f: - f.write(f''' + with open(os.path.join(image_root_dir, "Dockerfiles", output_file), "w") as f: + f.write( + f""" # # WARNING: Don't change this file manually. This file is auto-generated from # the Jinja2-templated Dockerfile.j2 file, so you need to change that file @@ -104,41 +92,44 @@ def generate_dockerfile(template: str, # # This file was generated on {datetime.now()} # - '''.strip()) + """.strip() + ) f.write(os.linesep) f.write(os.linesep) f.write(remove_repeated_empty_lines(output)) -def generate_base_dockerfile() -> None: +def generate_base_dockerfile(image_root_dir: Path) -> None: + """Generate the Dockerfile.base file based on the Dockerfile.base.j2 + template.""" # Template data data = { - 'bootstrapping_scripts_root_firstpass': [ - os.path.join('/bootstrap/01-root-firstpass', name).strip() - for name in sorted(os.listdir('./bootstrap/01-root-firstpass')) - if not is_dev_bootstrapping_step(name) or dev is True + "bootstrapping_scripts_root_firstpass": [ + os.path.join("/bootstrap/01-root-firstpass", file.name) + for file in (image_root_dir / "bootstrap/01-root-firstpass").iterdir() + if file.is_file() ], - 'bootstrapping_scripts_airflow': [ - os.path.join('/bootstrap/02-airflow', name).strip() - for name in sorted(os.listdir('./bootstrap/02-airflow')) - if not is_dev_bootstrapping_step(name) or dev is True + "bootstrapping_scripts_airflow": [ + os.path.join("/bootstrap/02-airflow", file.name) + for file in (image_root_dir / "bootstrap/02-airflow").iterdir() + if file.is_file() ], - 'bootstrapping_scripts_root_secondpass': [ - os.path.join('/bootstrap/03-root-secondpass', name).strip() - for name in sorted(os.listdir('./bootstrap/03-root-secondpass')) - if not is_dev_bootstrapping_step(name) or dev is True + "bootstrapping_scripts_root_secondpass": [ + os.path.join("/bootstrap/03-root-secondpass", file.name) + for file in (image_root_dir / "bootstrap/03-root-secondpass").iterdir() + if file.is_file() ], } - template_name = 'Dockerfile.base.j2' - dockerfile_name = 'Dockerfile.base' - generate_dockerfile(template_name, dockerfile_name, data) + template_name = "Dockerfile.base.j2" + dockerfile_name = "Dockerfile.base" + generate_dockerfile(image_root_dir, template_name, dockerfile_name, data) -def generate_derivative_dockerfiles(build_type: str = 'standard', - dev: bool = False) -> None: - """ - Generate a Dockerfile based on the given build arguments. +def generate_derivative_dockerfiles( + image_root_dir: Path, build_type: str = "standard", dev: bool = False +) -> None: + """Generate a Dockerfile based on the given build arguments. :param build_type: Specifies the build type. This can have the following values: @@ -157,32 +148,50 @@ def generate_derivative_dockerfiles(build_type: str = 'standard', e.g. editors, sudo, etc. 
""" - template_name = 'Dockerfile.derivatives.j2' - dockerfile_name = 'Dockerfile' - if build_type != 'standard': - dockerfile_name = f'{dockerfile_name}-{build_type}' + template_name = "Dockerfile.derivatives.j2" + dockerfile_name = "Dockerfile" + if build_type != "standard": + dockerfile_name = f"{dockerfile_name}-{build_type}" if dev: - dockerfile_name = f'{dockerfile_name}-dev' + dockerfile_name = f"{dockerfile_name}-dev" data = { - 'bootstrapping_scripts_dev': [ - os.path.join('/bootstrap-dev', name).strip() - for name in sorted(os.listdir('./bootstrap-dev')) - ] if dev else [], - 'build_type': build_type, + "bootstrapping_scripts_dev": ( + [ + os.path.join("/bootstrap-dev", file.name) + for file in (image_root_dir / "bootstrap-dev").iterdir() + if file.is_file() + ] + if dev + else [] + ), + "build_type": build_type, } - generate_dockerfile(template_name, dockerfile_name, data) + generate_dockerfile(image_root_dir, template_name, dockerfile_name, data) -if __name__ == '__main__': +def generate_airflow_dockerfiles(image_root_dir: Path): # Generate the base Dockerfile file (Dockerfile.base). - generate_base_dockerfile() + generate_base_dockerfile(image_root_dir) # Generate the derivative Dockerfiles (multiple Dockerfiles based on # the build arguments.) for dev in [True, False]: - for build_type in ['standard', 'explorer', 'explorer-privileged']: - generate_derivative_dockerfiles(build_type=build_type, dev=dev) + for build_type in ["standard", "explorer", "explorer-privileged"]: + generate_derivative_dockerfiles( + image_root_dir, build_type=build_type, dev=dev + ) + + +def main(): + for x in Path(__file__).parent.iterdir(): + if not x.is_dir(): + continue + generate_airflow_dockerfiles(x) + + +if __name__ == "__main__": + main() else: - print('This module cannot be imported.') + print("This module cannot be imported.") sys.exit(1) diff --git a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 0000000..8e2ae3c --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,14 @@ +{ + "include": ["./"], + "exclude": [ + "**/__pycache__", + "**/.venv", + // We ignore the ./images folder since it contains Docker images that + // have their own different Python packages and thus require runs. + "./images" + ], + "strict": ["./"], + "pythonVersion": "3.11", + "pythonPlatform": "All", + "typeCheckingMode": "strict" +} diff --git a/quality-checks/lint_python.sh b/quality-checks/lint_python.sh index 0572d5a..ff65c3c 100755 --- a/quality-checks/lint_python.sh +++ b/quality-checks/lint_python.sh @@ -9,11 +9,36 @@ if [[ "$PWD" != "$REPO_ROOT" ]]; then exit 1 fi -# Lint all Python files -echo "Running Flake8 on Python files..." -if ! flake8 .; then - echo "Flake8 linting failed." - exit 1 -else - echo "Flake8 linting passed." -fi +check_dir() { + local dir=$1 # Directory to work in + local venv_dir="${dir}/.venv" # virtual environment path + + echo "Checking directory ${dir}." + + # Check if virtualenv exists, if not create it and install dependencies + if [[ ! -d "$venv_dir" ]]; then + echo "Virtual environment doesn't exist at ${venv_dir}. Please run the script ./create_venvs.py." + exit 1 + fi + + # shellcheck source=/dev/null + source "${venv_dir}/bin/activate" + # Run ruff and Pyright + echo "Running ruff..." + ruff "${dir}" + echo "Running Pyright..." + pyright "${dir}" + deactivate + + echo +} + +# Main repo setup and checks +check_dir "." 
+ +# Setup and checks for each Docker image under ./images/airflow +for image_dir in ./images/airflow/*; do + if [[ -d "$image_dir" ]]; then + check_dir "$image_dir" + fi +done diff --git a/quality-checks/pip_install_check.py b/quality-checks/pip_install_check.py index 7f0dfa9..d7fbc0e 100755 --- a/quality-checks/pip_install_check.py +++ b/quality-checks/pip_install_check.py @@ -3,8 +3,8 @@ import sys -EMJOI_CHECK_MARK_BUTTON = '\u2705' -EMJOI_CROSS_MARK = '\u274C' +EMJOI_CHECK_MARK_BUTTON = "\u2705" +EMJOI_CROSS_MARK = "\u274C" def check_file_for_pip_install(filepath: str) -> bool: @@ -15,9 +15,9 @@ def check_file_for_pip_install(filepath: str) -> bool: :returns True if the check passes (no 'pip install' found), else False. """ - with open(filepath, 'r') as file: + with open(filepath, "r") as file: for line in file: - if 'pip install' in line: + if "pip install" in line: return False return True @@ -34,13 +34,13 @@ def verify_no_pip_install(directory: str) -> bool: # Check if the directory exists if not os.path.isdir(directory): print(f"The directory {directory} does not exist.") - return + return True # Walk through the directory tree ret_code = True - for root, dirs, files in os.walk(directory): + for root, _dirs, files in os.walk(directory): for filename in files: - if filename.endswith('.sh'): # Check for bash scripts + if filename.endswith(".sh"): # Check for bash scripts filepath = os.path.join(root, filename) if check_file_for_pip_install(filepath): print(f"{EMJOI_CHECK_MARK_BUTTON} {filepath}") @@ -59,25 +59,29 @@ def verify_in_repo_root() -> None: # Determine the script's directory and the parent directory (which should # be ) script_dir = os.path.dirname(os.path.realpath(__file__)) - repo_root = os.path.abspath(os.path.join(script_dir, '..')) + repo_root = os.path.abspath(os.path.join(script_dir, "..")) # Check if the current working directory is the repo root if os.getcwd() != repo_root: - print("The script must be run from the repo root. Please cd into " + - "the repo root directory and then type: " + - f"./quality-checks/{os.path.basename(__file__)}.") + print( + "The script must be run from the repo root. Please cd into " + "the repo root directory and then type: " + f"./quality-checks/{os.path.basename(__file__)}." + ) sys.exit(1) def main() -> None: verify_in_repo_root() - if verify_no_pip_install('./'): + if verify_no_pip_install("./"): sys.exit(0) else: - print("Some files failed the check. Please ensure you are using " + - "`safe-pip-install` in those files instead of directly " + - "calling `pip install`.") + print( + "Some files failed the check. Please ensure you are using " + "`safe-pip-install` in those files instead of directly " + "calling `pip install`." + ) sys.exit(1) diff --git a/quality-checks/run_all.py b/quality-checks/run_all.py index b1bb58d..ff6b7fc 100755 --- a/quality-checks/run_all.py +++ b/quality-checks/run_all.py @@ -2,14 +2,22 @@ import os import subprocess import sys +from typing import List -def prefix_output(file, process): +# NOTE Ideally, we should be specifying the typing annotation for 'process' to +# `subprocess.Popne[bytes]`. However, this requires Python 3.9+ and Amazon Linux 2 +# is still on 3.7/3.8. +# TODO Remove support of Amazon Linux 2 from this package as soon as possible, as we +# shouldn't be relying on an EOLed Python version. +def prefix_output(file: str, process: subprocess.Popen) -> None: # type: ignore """ Prefix each line of output with the filename. 
""" - for line in process.stdout: - print(f"[{file}] {line.decode().strip()}") + if not process.stdout: # type: ignore + raise RuntimeError("Process doesn't have an stdout stream.") + for line in process.stdout: # type: ignore + print(f"[{file}] {line.decode().strip()}") # type: ignore def verify_in_repo_root() -> None: @@ -20,13 +28,15 @@ def verify_in_repo_root() -> None: # Determine the script's directory and the parent directory (which should # be ) script_dir = os.path.dirname(os.path.realpath(__file__)) - repo_root = os.path.abspath(os.path.join(script_dir, '..')) + repo_root = os.path.abspath(os.path.join(script_dir, "..")) # Check if the current working directory is the repo root if os.getcwd() != repo_root: - print("The script must be run from the repo root. Please cd into " + - "the repo root directory and then type: " + - f"./quality-checks/{os.path.basename(__file__)}.") + print( + "The script must be run from the repo root. Please cd into " + "the repo root directory and then type: " + f"./quality-checks/{os.path.basename(__file__)}." + ) sys.exit(1) @@ -36,8 +46,8 @@ def main() -> None: """ verify_in_repo_root() - quality_checks_dir = './quality-checks/' - failed_scripts = [] + quality_checks_dir = "./quality-checks/" + failed_scripts: List[str] = [] # Iterate over every file in the quality-checks directory for file in os.listdir(quality_checks_dir): @@ -50,25 +60,26 @@ def main() -> None: # Check if the file is executable if os.access(filepath, os.X_OK): print(f"Executing: {file}") - with subprocess.Popen(filepath, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) as process: + with subprocess.Popen( + filepath, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as process: prefix_output(file, process) process.wait() # Wait for the process to complete # Check the exit status of the script if process.returncode != 0: - print(f"Script {file} failed with exit status " + - f"{process.returncode}.") + print( + f"Script {file} failed with exit status " + f"{process.returncode}." + ) failed_scripts.append(file) print() # Exit with a non-zero status if any script failed if failed_scripts: - print('The following scripts failed:') + print("The following scripts failed:") for fs in failed_scripts: - print(f'- {fs}') + print(f"- {fs}") sys.exit(1) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..511709c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +# This requirements.txt file is intended for the repository's scripts and +# quality checks, not for the Docker images contained within. It is required for +# running repository scripts and maintenance tasks, such as generating +# Dockerfiles, linting, testing, and other scripts that help ensure code +# quality. 
+Jinja2==3.1.3 +MarkupSafe==2.1.5 +nodeenv==1.8.0 +pyright==1.1.349 +ruff==0.2.0 +setuptools==68.2.2 +wheel==0.41.2 \ No newline at end of file From c83b9ed8fd8c5e21df3bf862192bfc0ce2b743fa Mon Sep 17 00:00:00 2001 From: Rafid K Date: Tue, 6 Feb 2024 03:20:44 +0000 Subject: [PATCH 07/11] Support installing custom user pip requirements --- .vscode/launch.json | 15 ++++++ create_venvs.py | 0 images/airflow/2.8.0/.dockerignore | 1 + .../2.8.0/bin/airflow-user/safe-pip-install | 1 - images/airflow/2.8.0/build.sh | 7 ++- images/airflow/2.8.0/docker-compose.yaml | 4 ++ .../airflow/2.8.0/python/mwaa/entrypoint.py | 18 +++++++ images/airflow/2.8.0/requirements.txt | 3 ++ .../2.8.0/requirements/requirements.txt | 1 + images/airflow/generate-dockerfiles.py | 48 +++++++++++-------- quality-checks/lint_python.sh | 3 +- 11 files changed, 77 insertions(+), 24 deletions(-) create mode 100644 .vscode/launch.json mode change 100755 => 100644 create_venvs.py create mode 100644 images/airflow/2.8.0/.dockerignore create mode 100644 images/airflow/2.8.0/requirements/requirements.txt diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..cb8c5e2 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Generate Airflow Dockerfiles", + "type": "debugpy", + "request": "launch", + "program": "./images/airflow/generate-dockerfiles.py", + "console": "integratedTerminal" + } + ] +} diff --git a/create_venvs.py b/create_venvs.py old mode 100755 new mode 100644 diff --git a/images/airflow/2.8.0/.dockerignore b/images/airflow/2.8.0/.dockerignore new file mode 100644 index 0000000..b694934 --- /dev/null +++ b/images/airflow/2.8.0/.dockerignore @@ -0,0 +1 @@ +.venv \ No newline at end of file diff --git a/images/airflow/2.8.0/bin/airflow-user/safe-pip-install b/images/airflow/2.8.0/bin/airflow-user/safe-pip-install index c088913..7fb324c 100644 --- a/images/airflow/2.8.0/bin/airflow-user/safe-pip-install +++ b/images/airflow/2.8.0/bin/airflow-user/safe-pip-install @@ -1,7 +1,6 @@ #!/bin/bash # Define an array of required packages -# TODO Remove this and use requirements.txt. REQUIRED_PACKAGES=( "apache-airflow-providers-amazon[aiobotocore]==${AIRFLOW_AMAZON_PROVIDERS_VERSION}" "apache-airflow[celery,statsd]==${AIRFLOW_VERSION}" diff --git a/images/airflow/2.8.0/build.sh b/images/airflow/2.8.0/build.sh index c82df21..3e7a8cb 100755 --- a/images/airflow/2.8.0/build.sh +++ b/images/airflow/2.8.0/build.sh @@ -2,7 +2,10 @@ set -e # Generate the Dockerfiles from the templates. -python3 generate-dockerfiles.py +# shellcheck source=/dev/null +source "../../../.venv/bin/activate" +python3 ../generate-dockerfiles.py +deactivate # Build the base image. 
docker build -f ./Dockerfiles/Dockerfile.base -t amazon-mwaa/airflow:2.8.0-base ./ @@ -25,4 +28,4 @@ for dev in "True" "False"; do docker build -f "./Dockerfiles/${dockerfile_name}" -t "${tag_name}" ./ done -done \ No newline at end of file +done diff --git a/images/airflow/2.8.0/docker-compose.yaml b/images/airflow/2.8.0/docker-compose.yaml index 3af4d77..00115e1 100644 --- a/images/airflow/2.8.0/docker-compose.yaml +++ b/images/airflow/2.8.0/docker-compose.yaml @@ -12,6 +12,9 @@ x-airflow-common: &airflow-common AWS_REGION: ${AWS_REGION} AWS_DEFAULT_REGION: ${AWS_REGION} + # Core configuration + MWAA__CORE__REQUIREMENTS_PATH: "/usr/local/airflow/requirements/requirements.txt" + # Database configuration MWAA__DB__POSTGRES_HOST: "postgres" MWAA__DB__POSTGRES_PORT: "5432" @@ -25,6 +28,7 @@ x-airflow-common: &airflow-common volumes: - ./dags:/usr/local/airflow/dags - ./plugins:/usr/local/airflow/plugins + - ./requirements:/usr/local/airflow/requirements depends_on: &airflow-common-depends-on postgres: condition: service_healthy diff --git a/images/airflow/2.8.0/python/mwaa/entrypoint.py b/images/airflow/2.8.0/python/mwaa/entrypoint.py index 4ad3654..843726a 100644 --- a/images/airflow/2.8.0/python/mwaa/entrypoint.py +++ b/images/airflow/2.8.0/python/mwaa/entrypoint.py @@ -129,6 +129,23 @@ def create_www_user(environ: dict[str, str]) -> None: raise RuntimeError(f"Failed to create user. Error: {response.stderr}") +def install_user_requirements(environ: dict[str, str]) -> None: + requirements_file = environ.get("MWAA__CORE__REQUIREMENTS_PATH") + print(f"MWAA__CORE__REQUIREMENTS_PATH = {requirements_file}") + if requirements_file and os.path.isfile(requirements_file): + print(f"Installing user requirements from {requirements_file}...") + subprocess.run( + [ + "safe-pip-install", + "-r", + str(requirements_file), + ], + check=True, + ) + else: + print("No user requirements to install.") + + def export_env_variables(environ: dict[str, str]) -> None: # Get the home directory of the current user home_dir = os.path.expanduser("~") @@ -178,6 +195,7 @@ def main() -> None: airflow_db_init(environ) create_www_user(environ) + install_user_requirements(environ) # Export the environment variables to .bashrc and .bash_profile to enable # users to run a shell on the container and have the necessary environment diff --git a/images/airflow/2.8.0/requirements.txt b/images/airflow/2.8.0/requirements.txt index db0fbeb..1572cd7 100644 --- a/images/airflow/2.8.0/requirements.txt +++ b/images/airflow/2.8.0/requirements.txt @@ -3,4 +3,7 @@ apache-airflow[celery,statsd]==2.8.0 celery[sqs] psycopg2 pycurl +pyright +ruff +sqlalchemy-stubs watchtower \ No newline at end of file diff --git a/images/airflow/2.8.0/requirements/requirements.txt b/images/airflow/2.8.0/requirements/requirements.txt new file mode 100644 index 0000000..9a66a40 --- /dev/null +++ b/images/airflow/2.8.0/requirements/requirements.txt @@ -0,0 +1 @@ +# Add your requirements here \ No newline at end of file diff --git a/images/airflow/generate-dockerfiles.py b/images/airflow/generate-dockerfiles.py index e1bdac5..c91ce58 100644 --- a/images/airflow/generate-dockerfiles.py +++ b/images/airflow/generate-dockerfiles.py @@ -104,21 +104,27 @@ def generate_base_dockerfile(image_root_dir: Path) -> None: template.""" # Template data data = { - "bootstrapping_scripts_root_firstpass": [ - os.path.join("/bootstrap/01-root-firstpass", file.name) - for file in (image_root_dir / "bootstrap/01-root-firstpass").iterdir() - if file.is_file() - ], - 
"bootstrapping_scripts_airflow": [ - os.path.join("/bootstrap/02-airflow", file.name) - for file in (image_root_dir / "bootstrap/02-airflow").iterdir() - if file.is_file() - ], - "bootstrapping_scripts_root_secondpass": [ - os.path.join("/bootstrap/03-root-secondpass", file.name) - for file in (image_root_dir / "bootstrap/03-root-secondpass").iterdir() - if file.is_file() - ], + "bootstrapping_scripts_root_firstpass": sorted( + [ + os.path.join("/bootstrap/01-root-firstpass", file.name) + for file in (image_root_dir / "bootstrap/01-root-firstpass").iterdir() + if file.is_file() + ] + ), + "bootstrapping_scripts_airflow": sorted( + [ + os.path.join("/bootstrap/02-airflow", file.name) + for file in (image_root_dir / "bootstrap/02-airflow").iterdir() + if file.is_file() + ] + ), + "bootstrapping_scripts_root_secondpass": sorted( + [ + os.path.join("/bootstrap/03-root-secondpass", file.name) + for file in (image_root_dir / "bootstrap/03-root-secondpass").iterdir() + if file.is_file() + ] + ), } template_name = "Dockerfile.base.j2" @@ -156,11 +162,13 @@ def generate_derivative_dockerfiles( dockerfile_name = f"{dockerfile_name}-dev" data = { "bootstrapping_scripts_dev": ( - [ - os.path.join("/bootstrap-dev", file.name) - for file in (image_root_dir / "bootstrap-dev").iterdir() - if file.is_file() - ] + sorted( + [ + os.path.join("/bootstrap-dev", file.name) + for file in (image_root_dir / "bootstrap-dev").iterdir() + if file.is_file() + ] + ) if dev else [] ), diff --git a/quality-checks/lint_python.sh b/quality-checks/lint_python.sh index ff65c3c..039440d 100755 --- a/quality-checks/lint_python.sh +++ b/quality-checks/lint_python.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e # Ensure the script is being executed while being in the repo root. SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -13,7 +14,7 @@ check_dir() { local dir=$1 # Directory to work in local venv_dir="${dir}/.venv" # virtual environment path - echo "Checking directory ${dir}." + echo "Checking directory \"${dir}\"..." # Check if virtualenv exists, if not create it and install dependencies if [[ ! -d "$venv_dir" ]]; then From 85438ed3ba95954949212126eb70557d565a4b86 Mon Sep 17 00:00:00 2001 From: Rafid K Date: Mon, 12 Feb 2024 02:02:26 +0000 Subject: [PATCH 08/11] Changes required to deploy the image using MWAA These open source Docker images will be used both externally by our customers willing to experiment with the images in native Docker and internally within an Amazon MWAA setup (which relies on Fargate.) This commit involves multiple small changes to make this possible: - Introduced a `/healthcheck.sh` script which is used by Fargate to monitor health status. This script currently always return success status (0 code) just to make the integration possible. In the future, we need to: - Improve this script to do some real checks. - Move this script to a better location (scripts shouldn't be placed at the root.) - Supported reading database credentials from a JSON-formatted environment variable, `MWAA__DB__CREDENTIALS`, containing the username and password. This is needed because Amazon MWAA employs Secrets Manager to pass the credentials safely to the Fargate container in a JSON-formatted object. During the work on this, I temporarily downgraded the Airflow version to 2.7.2 since this a version we internally support, which should make the testing easier. 
--- .vscode/settings.json | 11 ++- images/airflow/2.8.0/.vscode/settings.json | 11 ++- images/airflow/2.8.0/Dockerfile.base.j2 | 15 +++- images/airflow/2.8.0/Dockerfiles/Dockerfile | 2 +- .../airflow/2.8.0/Dockerfiles/Dockerfile-dev | 2 +- .../2.8.0/Dockerfiles/Dockerfile-explorer | 2 +- .../2.8.0/Dockerfiles/Dockerfile-explorer-dev | 2 +- .../Dockerfile-explorer-privileged | 2 +- .../Dockerfile-explorer-privileged-dev | 2 +- .../airflow/2.8.0/Dockerfiles/Dockerfile.base | 18 +++-- images/airflow/2.8.0/healthcheck.sh | 4 ++ .../2.8.0/python/mwaa/config/database.py | 68 +++++++++++++++++-- 12 files changed, 119 insertions(+), 20 deletions(-) create mode 100644 images/airflow/2.8.0/healthcheck.sh diff --git a/.vscode/settings.json b/.vscode/settings.json index 6385e6c..2d5f82e 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -8,6 +8,15 @@ "**/Thumbs.db": true, "**/venv": true }, + "files.watcherExclude": { + "**/.DS_Store": true, + "**/.conda": true, + "**/.git": true, + "**/.ruff_cache": true, + "**/.venv": true, + "**/Thumbs.db": true, + "**/venv": true + }, "search.exclude": { "**/.DS_Store": true, "**/.conda": true, @@ -17,5 +26,5 @@ "**/Thumbs.db": true, "**/venv": true }, - "python.defaultInterpreterPath": "./venv/bin/python" + "python.defaultInterpreterPath": "./.venv/bin/python" } diff --git a/images/airflow/2.8.0/.vscode/settings.json b/images/airflow/2.8.0/.vscode/settings.json index 6385e6c..2d5f82e 100644 --- a/images/airflow/2.8.0/.vscode/settings.json +++ b/images/airflow/2.8.0/.vscode/settings.json @@ -8,6 +8,15 @@ "**/Thumbs.db": true, "**/venv": true }, + "files.watcherExclude": { + "**/.DS_Store": true, + "**/.conda": true, + "**/.git": true, + "**/.ruff_cache": true, + "**/.venv": true, + "**/Thumbs.db": true, + "**/venv": true + }, "search.exclude": { "**/.DS_Store": true, "**/.conda": true, @@ -17,5 +26,5 @@ "**/Thumbs.db": true, "**/venv": true }, - "python.defaultInterpreterPath": "./venv/bin/python" + "python.defaultInterpreterPath": "./.venv/bin/python" } diff --git a/images/airflow/2.8.0/Dockerfile.base.j2 b/images/airflow/2.8.0/Dockerfile.base.j2 index 5f7f967..0cbd167 100644 --- a/images/airflow/2.8.0/Dockerfile.base.j2 +++ b/images/airflow/2.8.0/Dockerfile.base.j2 @@ -1,11 +1,15 @@ FROM public.ecr.aws/amazonlinux/amazonlinux:2023 # Environment variables -ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 -ENV AIRFLOW_CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-2.8.0/constraints-3.11.txt" + +# Temporarily downgrading to 2.7.2 to make it easier to test the Docker image +# within Amazon MWAA since 2.7.2 is a version we support. +ENV AIRFLOW_VERSION=2.7.2 +ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.7.1 + +ENV AIRFLOW_CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-2.7.2/constraints-3.11.txt" ENV AIRFLOW_USER_HOME=/usr/local/airflow ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} -ENV AIRFLOW_VERSION=2.8.0 ENV MWAA_HOME=/usr/local/mwaa ENV PYTHON_VERSION=3.11.7 @@ -96,8 +100,13 @@ EXPOSE 8080 ENV PATH=${PATH_AIRFLOW_USER} ENV PYTHONPATH="/python" +ENV PYTHONUNBUFFERED=1 WORKDIR ${AIRFLOW_USER_HOME} # Copy python files. COPY ./python /python + +# TODO Move this to the bin folder under airflow's home folder. 
+COPY healthcheck.sh /healthcheck.sh +RUN chmod +x /healthcheck.sh \ No newline at end of file diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile b/images/airflow/2.8.0/Dockerfiles/Dockerfile index 12112d5..0581364 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 20:02:12.342796 +# This file was generated on 2024-02-12 01:56:33.029839 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev index 4afee30..f4c0c5f 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 20:02:12.334771 +# This file was generated on 2024-02-12 01:56:33.021778 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer index 702f5e2..96c6a1f 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 20:02:12.345378 +# This file was generated on 2024-02-12 01:56:33.032499 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev index 6f326d8..9d285d8 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 20:02:12.337537 +# This file was generated on 2024-02-12 01:56:33.024518 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged index f8b6b1e..6ef5f4d 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 20:02:12.348042 +# This file was generated on 2024-02-12 01:56:33.035092 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev index 9b6022e..1ce50d9 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. 
# -# This file was generated on 2024-01-31 20:02:12.340210 +# This file was generated on 2024-02-12 01:56:33.027225 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile.base b/images/airflow/2.8.0/Dockerfiles/Dockerfile.base index 6a3f3d4..228a8d5 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile.base +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile.base @@ -3,17 +3,20 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-01-31 20:02:12.330369 +# This file was generated on 2024-02-12 01:56:33.018473 # FROM public.ecr.aws/amazonlinux/amazonlinux:2023 # Environment variables -ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 -ENV AIRFLOW_CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-2.8.0/constraints-3.11.txt" + +# Temporarily downgrading to 2.7.2 to make it easier to test using it internally. +ENV AIRFLOW_VERSION=2.7.2 +ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.7.1 + +ENV AIRFLOW_CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-2.7.2/constraints-3.11.txt" ENV AIRFLOW_USER_HOME=/usr/local/airflow ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} -ENV AIRFLOW_VERSION=2.8.0 ENV MWAA_HOME=/usr/local/mwaa ENV PYTHON_VERSION=3.11.7 @@ -108,8 +111,13 @@ EXPOSE 8080 ENV PATH=${PATH_AIRFLOW_USER} ENV PYTHONPATH="/python" +ENV PYTHONUNBUFFERED=1 WORKDIR ${AIRFLOW_USER_HOME} # Copy python files. -COPY ./python /python \ No newline at end of file +COPY ./python /python + +# TODO Move this to the bin folder under airflow's home folder. +COPY healthcheck.sh /healthcheck.sh +RUN chmod +x /healthcheck.sh \ No newline at end of file diff --git a/images/airflow/2.8.0/healthcheck.sh b/images/airflow/2.8.0/healthcheck.sh new file mode 100644 index 0000000..ecb709f --- /dev/null +++ b/images/airflow/2.8.0/healthcheck.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# TODO Move this to the bin folder under airflow's home folder. +echo "Health check succeeded." +exit 0 diff --git a/images/airflow/2.8.0/python/mwaa/config/database.py b/images/airflow/2.8.0/python/mwaa/config/database.py index f149748..fd4faee 100644 --- a/images/airflow/2.8.0/python/mwaa/config/database.py +++ b/images/airflow/2.8.0/python/mwaa/config/database.py @@ -1,5 +1,68 @@ import os +import json from operator import itemgetter +from typing import Tuple + + +def get_db_credentials() -> Tuple[str, str]: + """ + Retrieves database credentials from environment variables. + + This function looks for database credentials in two possible locations within the + environment variables: + 1. MWAA__DB__CREDENTIALS: expects a JSON string containing "username" and "password" + keys. + 2. MWAA__DB__POSTGRES_USER and MWAA__DB__POSTGRES_PASSWORD: separate environment + variables for the username and password. + + The function first checks for the presence of "MWAA__DB__CREDENTIALS". If found, it + parses the JSON string to extract the username and password. If not found, it then + looks for the "MWAA__DB__POSTGRES_USER" and "MWAA__DB__POSTGRES_PASSWORD" + environment variables. + + If neither method finds the credentials, a RuntimeError is raised indicating the + absence of necessary environment variables for database connection. + + Returns: + Tuple[str, str]: A tuple containing the PostgreSQL username and password. 
+ + Raises: + RuntimeError: If neither MWAA__DB__CREDENTIALS nor MWAA__DB__POSTGRES_USER and + MWAA__DB__POSTGRES_PASSWORD environment variables are set, indicating that the + database credentials are not provided. + + Example: + To use this function, ensure that the required environment variables are set in + your environment before calling it. Then, you can retrieve the credentials as + follows: + + >>> user, password = get_db_credentials() + >>> print(f"Username: {user}, Password: {password}") + """ + + if "MWAA__DB__CREDENTIALS" in os.environ: + print("Reading database credentilas from MWAA__DB__CREDENTIALS.") + db_secrets = json.loads(os.environ["MWAA__DB__CREDENTIALS"]) + postgres_user = db_secrets["username"] + postgres_password = db_secrets["password"] + elif ( + "MWAA__DB__POSTGRES_USER" in os.environ + and "MWAA__DB__POSTGRES_PASSWORD" in os.environ + ): + print( + "Reading database credentilas from MWAA__DB__POSTGRES_USER/ " + "MWAA__DB__POSTGRES_USER environment variables." + ) + postgres_user = os.environ["MWAA__DB__POSTGRES_USER"] + postgres_password = os.environ["MWAA__DB__POSTGRES_PASSWORD"] + else: + raise RuntimeError( + "Couldn't find database credentials in environment variables. " + "Please pass them either in MWAA__DB__CREDENTIALS as a JSON with " + "'username' and 'password' fields, or in MWAA__DB__POSTGRES_USER " + "and MWAA__DB__POSTGRES_PASSWORD." + ) + return postgres_user, postgres_password def get_db_connection_string() -> str: @@ -11,18 +74,15 @@ def get_db_connection_string() -> str: env_vars_names = [ "MWAA__DB__POSTGRES_HOST", "MWAA__DB__POSTGRES_PORT", - "MWAA__DB__POSTGRES_USER", - "MWAA__DB__POSTGRES_PASSWORD", "MWAA__DB__POSTGRES_DB", ] try: ( postgres_host, postgres_port, - postgres_user, - postgres_password, postgres_db, ) = itemgetter(*env_vars_names)(os.environ) + (postgres_user, postgres_password) = get_db_credentials() except Exception as e: raise RuntimeError( "One or more of the required environment variables for " From 752b4bfa029dfaf6e96f671d330f2fec1d0ba696 Mon Sep 17 00:00:00 2001 From: Rafid K Date: Thu, 22 Feb 2024 23:15:57 +0000 Subject: [PATCH 09/11] Make the Docker Compose setup work with elasticmq To make the setup work without having to have an actual SQS account, I made the necessary changes to use a local SQS queue server served by elasticmq. --- images/airflow/2.8.0/Dockerfiles/Dockerfile | 2 +- .../airflow/2.8.0/Dockerfiles/Dockerfile-dev | 2 +- .../2.8.0/Dockerfiles/Dockerfile-explorer | 2 +- .../2.8.0/Dockerfiles/Dockerfile-explorer-dev | 2 +- .../Dockerfile-explorer-privileged | 2 +- .../Dockerfile-explorer-privileged-dev | 2 +- .../airflow/2.8.0/Dockerfiles/Dockerfile.base | 5 ++- images/airflow/2.8.0/docker-compose.yaml | 38 ++++++++++--------- .../2.8.0/python/mwaa/config/celery.py | 4 +- .../airflow/2.8.0/python/mwaa/config/sqs.py | 24 +++++++++++- .../airflow/2.8.0/python/mwaa/entrypoint.py | 32 ++++++++++++++++ images/airflow/2.8.0/requirements.txt | 12 +++++- 12 files changed, 95 insertions(+), 32 deletions(-) diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile b/images/airflow/2.8.0/Dockerfiles/Dockerfile index 0581364..ee70bfa 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. 
# -# This file was generated on 2024-02-12 01:56:33.029839 +# This file was generated on 2024-02-22 19:42:44.935774 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev index f4c0c5f..0ae295d 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-12 01:56:33.021778 +# This file was generated on 2024-02-22 19:42:44.927521 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer index 96c6a1f..df2a928 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-12 01:56:33.032499 +# This file was generated on 2024-02-22 19:42:44.938417 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev index 9d285d8..a229408 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-12 01:56:33.024518 +# This file was generated on 2024-02-22 19:42:44.930305 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged index 6ef5f4d..dfb4196 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-12 01:56:33.035092 +# This file was generated on 2024-02-22 19:42:44.941098 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev index 1ce50d9..c4676f4 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev @@ -3,7 +3,7 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-12 01:56:33.027225 +# This file was generated on 2024-02-22 19:42:44.933100 # FROM amazon-mwaa/airflow:2.8.0-base diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile.base b/images/airflow/2.8.0/Dockerfiles/Dockerfile.base index 228a8d5..095bd05 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile.base +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile.base @@ -3,14 +3,15 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-12 01:56:33.018473 +# This file was generated on 2024-02-22 19:42:44.924226 # FROM public.ecr.aws/amazonlinux/amazonlinux:2023 # Environment variables -# Temporarily downgrading to 2.7.2 to make it easier to test using it internally. 
+# Temporarily downgrading to 2.7.2 to make it easier to test the Docker image +# within Amazon MWAA since 2.7.2 is a version we support. ENV AIRFLOW_VERSION=2.7.2 ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.7.1 diff --git a/images/airflow/2.8.0/docker-compose.yaml b/images/airflow/2.8.0/docker-compose.yaml index 00115e1..4f0b912 100644 --- a/images/airflow/2.8.0/docker-compose.yaml +++ b/images/airflow/2.8.0/docker-compose.yaml @@ -2,7 +2,6 @@ version: "3.8" x-airflow-common: &airflow-common image: amazon-mwaa/airflow:2.8.0 - container_name: mwaa-280-db restart: always environment: # AWS credentials @@ -23,7 +22,10 @@ x-airflow-common: &airflow-common MWAA__DB__POSTGRES_DB: "airflow" # SQS configuration - MWAA__SQS__QUEUE_URL: ${MWAA__SQS__QUEUE_URL} + MWAA__SQS__CREATE_QUEUE: True + MWAA__SQS__CUSTOM_ENDPOINT: http://sqs:9324 + MWAA__SQS__QUEUE_URL: http://sqs:9324/000000000000/celery-queue + MWAA__SQS__USE_SSL: False volumes: - ./dags:/usr/local/airflow/dags @@ -32,6 +34,8 @@ x-airflow-common: &airflow-common depends_on: &airflow-common-depends-on postgres: condition: service_healthy + sqs: + condition: service_healthy services: postgres: @@ -54,22 +58,20 @@ services: expose: - 5432 - # TODO Support a local SQS server to allow the user to use this Docker Compose file without a real AWS account. - # - # sqs: - # image: softwaremill/elasticmq:latest - # healthcheck: - # # https://github.com/softwaremill/elasticmq/issues/776#issuecomment-1582527921 - # test: ["CMD-SHELL", "wget -q -S -O - 127.0.0.1:9324/?Action=ListQueues"] - # interval: 10s - # retries: 5 - # start_period: 5s - # ports: - # - 9324:9324 - # - 9325:9325 - # expose: - # - 9324 - # - 9325 + sqs: + image: softwaremill/elasticmq:latest + healthcheck: + # https://github.com/softwaremill/elasticmq/issues/776#issuecomment-1582527921 + test: ["CMD-SHELL", "wget -q -S -O - 127.0.0.1:9324/?Action=ListQueues"] + interval: 10s + retries: 5 + start_period: 5s + ports: + - 9324:9324 + - 9325:9325 + expose: + - 9324 + - 9325 # TODO Create a local CloudWatch endpoint to allow the customer to use this Docker Compose file without a real AWS account. # TODO Create a local CloudWatch Metrics endpoint to allow the customer to use this Docker Compose file without a real AWS account. diff --git a/images/airflow/2.8.0/python/mwaa/config/celery.py b/images/airflow/2.8.0/python/mwaa/config/celery.py index 588c658..c67a805 100644 --- a/images/airflow/2.8.0/python/mwaa/config/celery.py +++ b/images/airflow/2.8.0/python/mwaa/config/celery.py @@ -7,7 +7,7 @@ # Our import from mwaa.config.aws import get_aws_region -from mwaa.config.sqs import get_sqs_queue_name, get_sqs_queue_url +from mwaa.config.sqs import get_sqs_queue_name, get_sqs_queue_url, should_use_ssl def create_celery_config() -> dict[str, Any]: @@ -23,7 +23,7 @@ def create_celery_config() -> dict[str, Any]: "broker_transport_options": { **celery_config["broker_transport_options"], "predefined_queues": {get_sqs_queue_name(): {"url": get_sqs_queue_url()}}, - "is_secure": True, + "is_secure": should_use_ssl(), "region": get_aws_region(), }, } diff --git a/images/airflow/2.8.0/python/mwaa/config/sqs.py b/images/airflow/2.8.0/python/mwaa/config/sqs.py index 2d04223..926b7bf 100644 --- a/images/airflow/2.8.0/python/mwaa/config/sqs.py +++ b/images/airflow/2.8.0/python/mwaa/config/sqs.py @@ -38,7 +38,7 @@ def _change_protocol_to_sqs(url: str) -> str: ) -def _get_sqs_default_endpoint() -> str: +def get_sqs_default_endpoint() -> str: """ Retrieves the default SQS endpoint for the current AWS region. 
""" @@ -61,7 +61,7 @@ def get_sqs_endpoint() -> str: used. """ return _change_protocol_to_sqs( - os.environ.get("MWAA__SQS__CUSTOM_ENDPOINT") or _get_sqs_default_endpoint() + os.environ.get("MWAA__SQS__CUSTOM_ENDPOINT") or get_sqs_default_endpoint() ) @@ -108,3 +108,23 @@ def get_sqs_queue_name() -> str: Retrieves the name of the SQS queue specified for use with Celery. """ return _get_queue_name_from_url(get_sqs_queue_url()) + + +def should_create_queue() -> bool: + """ + Determine whether the SQS queue should be created or not. + + :return: True or False. + """ + return os.environ.get("MWAA__SQS__CREATE_QUEUE", "false").lower() == "true" + + +def should_use_ssl() -> bool: + """ + Determines whether to use SSL when communicating with SQS or not. This + configuration is expected to be true when connecting to AWS SQS, and false + when connecting to elasticmq. + + :return: True or False. + """ + return os.environ.get("MWAA__SQS__USE_SSL", "true").lower() == "true" diff --git a/images/airflow/2.8.0/python/mwaa/entrypoint.py b/images/airflow/2.8.0/python/mwaa/entrypoint.py index 843726a..1172630 100644 --- a/images/airflow/2.8.0/python/mwaa/entrypoint.py +++ b/images/airflow/2.8.0/python/mwaa/entrypoint.py @@ -15,12 +15,18 @@ # 3rd party imports +import boto3 +from botocore.exceptions import ClientError from sqlalchemy import create_engine, text from sqlalchemy.engine import Engine # Our imports from mwaa.config.airflow import get_airflow_config from mwaa.config.database import get_db_connection_string +from mwaa.config.sqs import ( + get_sqs_queue_name, + should_create_queue, +) def abort(err_msg: str, exit_code: int = 1) -> None: @@ -129,6 +135,31 @@ def create_www_user(environ: dict[str, str]) -> None: raise RuntimeError(f"Failed to create user. Error: {response.stderr}") +@db_lock(1357) +def create_queue() -> None: + if not should_create_queue(): + return + queue_name = get_sqs_queue_name() + endpoint = os.environ.get("MWAA__SQS__CUSTOM_ENDPOINT") + sqs = boto3.client("sqs", endpoint_url=endpoint) # type: ignore + try: + # Try to get the queue URL to check if it exists + sqs.get_queue_url(QueueName=queue_name)["QueueUrl"] + print(f"Queue {queue_name} already exists.") + except ClientError as e: + # If the queue does not exist, create it + if ( + e.response.get("Error", {}).get("Code") + == "AWS.SimpleQueueService.NonExistentQueue" + ): + response = sqs.create_queue(QueueName=queue_name) + queue_url = response["QueueUrl"] + print(f"Queue created: {queue_url}") + else: + # If there is a different error, raise it + raise e + + def install_user_requirements(environ: dict[str, str]) -> None: requirements_file = environ.get("MWAA__CORE__REQUIREMENTS_PATH") print(f"MWAA__CORE__REQUIREMENTS_PATH = {requirements_file}") @@ -195,6 +226,7 @@ def main() -> None: airflow_db_init(environ) create_www_user(environ) + create_queue() install_user_requirements(environ) # Export the environment variables to .bashrc and .bash_profile to enable diff --git a/images/airflow/2.8.0/requirements.txt b/images/airflow/2.8.0/requirements.txt index 1572cd7..5274e67 100644 --- a/images/airflow/2.8.0/requirements.txt +++ b/images/airflow/2.8.0/requirements.txt @@ -1,5 +1,13 @@ -apache-airflow-providers-amazon[aiobotocore]==8.13.0 -apache-airflow[celery,statsd]==2.8.0 +# This requirements file is used when creating a virtual environment for +# building and developing the Airflow image. It is not to be confused with +# the requirements of Airflow within the image. 
Still, they are largely similar +# apart from some additional requirements for type checking, e.g. boto3-stubs +# or similar stuff for aiding build and development. +--constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.7.2/constraints-3.11.txt +apache-airflow-providers-amazon[aiobotocore]==8.7.1 +apache-airflow[celery,statsd]==2.7.2 +boto3 +boto3-stubs[essential] celery[sqs] psycopg2 pycurl From 43df46999f10454953f99b46f2fcdab854ec7b81 Mon Sep 17 00:00:00 2001 From: Rafid K Date: Sat, 24 Feb 2024 00:23:31 +0000 Subject: [PATCH 10/11] Enforce docstring using pydocstyle To aim for higher quality of the code, I added pydoctsyle to our quality checks. This will enforce documenting all code. --- .pydocstyle | 2 + create_venvs.py | 30 ++- images/airflow/2.8.0/.pydocstyle | 1 + images/airflow/2.8.0/python/mwaa/__init__.py | 1 + .../2.8.0/python/mwaa/config/__init__.py | 1 + .../2.8.0/python/mwaa/config/airflow.py | 26 ++- .../airflow/2.8.0/python/mwaa/config/aws.py | 10 +- .../2.8.0/python/mwaa/config/celery.py | 9 +- .../2.8.0/python/mwaa/config/database.py | 30 +-- .../airflow/2.8.0/python/mwaa/config/sqs.py | 41 ++-- .../airflow/2.8.0/python/mwaa/entrypoint.py | 111 ++++++++-- images/airflow/2.8.0/requirements.txt | 206 +++++++++++++++++- images/airflow/generate-dockerfiles.py | 1 + quality-checks/lint_python.sh | 10 +- quality-checks/pip_install_check.py | 18 +- quality-checks/run_all.py | 14 +- requirements.txt | 7 +- ruff.toml | 4 + 18 files changed, 416 insertions(+), 106 deletions(-) create mode 100644 .pydocstyle create mode 100644 images/airflow/2.8.0/.pydocstyle create mode 100644 ruff.toml diff --git a/.pydocstyle b/.pydocstyle new file mode 100644 index 0000000..9f91561 --- /dev/null +++ b/.pydocstyle @@ -0,0 +1,2 @@ +[pydocstyle] +match_dir = ^(?!.venv|images).*$ \ No newline at end of file diff --git a/create_venvs.py b/create_venvs.py index 1adf9ed..f128151 100644 --- a/create_venvs.py +++ b/create_venvs.py @@ -1,3 +1,17 @@ +""" +Create the virtual environments required to develop with this package. + +This module should be executed after cloning the repository to create the following +virtual environments: + +- One virtual environment at the root package. +- One per each Docker image + +Those environments are used for many tasks, most importantly allow the IDE to use the +right Python environment for the different folders in this repository. This is necessary +since the Python packages required to develop the different Airflow versions are +different from the packages that we need for the various scripts in this repository. +""" import os import subprocess import sys @@ -14,10 +28,11 @@ def verify_python_version(): def create_venv(path: Path): - """Create a virtual environment in the given directory and install - requirements if `requirements.txt` is present. + """ + Create a venv in the given directory and install requirements if present. - :param dir_path: The path to create the venv in.""" + :param dir_path: The path to create the venv in. + """ venv_path = path / ".venv" if not venv_path.exists(): @@ -31,10 +46,12 @@ def create_venv(path: Path): def pip_install(venv_dir: Path, requirements_file: Path): - """Install dependencies from requirements.txt if it exists. + """ + Install dependencies from requirements.txt if it exists. :param venv_dir: The path to the venv directory. - :param venv_dir: The path to the requirements.txt file.""" + :param venv_dir: The path to the requirements.txt file. 
+ """ if os.path.exists(requirements_file): print(f"Installing dependencies from {requirements_file}...") subprocess.run( @@ -53,10 +70,11 @@ def pip_install(venv_dir: Path, requirements_file: Path): def main(): - """Main entrypoint of the script.""" + """Start execution of the script.""" verify_python_version() project_dirs = [ Path("."), + Path("./images/mockwatch-logs"), *Path("./images").glob("airflow/*"), ] # Include main project dir and each image dir for dir_path in project_dirs: diff --git a/images/airflow/2.8.0/.pydocstyle b/images/airflow/2.8.0/.pydocstyle new file mode 100644 index 0000000..31ab1d2 --- /dev/null +++ b/images/airflow/2.8.0/.pydocstyle @@ -0,0 +1 @@ +[pydocstyle] diff --git a/images/airflow/2.8.0/python/mwaa/__init__.py b/images/airflow/2.8.0/python/mwaa/__init__.py index e69de29..14fae63 100644 --- a/images/airflow/2.8.0/python/mwaa/__init__.py +++ b/images/airflow/2.8.0/python/mwaa/__init__.py @@ -0,0 +1 @@ +"""Initialize the module.""" \ No newline at end of file diff --git a/images/airflow/2.8.0/python/mwaa/config/__init__.py b/images/airflow/2.8.0/python/mwaa/config/__init__.py index e69de29..14fae63 100644 --- a/images/airflow/2.8.0/python/mwaa/config/__init__.py +++ b/images/airflow/2.8.0/python/mwaa/config/__init__.py @@ -0,0 +1 @@ +"""Initialize the module.""" \ No newline at end of file diff --git a/images/airflow/2.8.0/python/mwaa/config/airflow.py b/images/airflow/2.8.0/python/mwaa/config/airflow.py index 934eb10..3962ebd 100644 --- a/images/airflow/2.8.0/python/mwaa/config/airflow.py +++ b/images/airflow/2.8.0/python/mwaa/config/airflow.py @@ -1,3 +1,4 @@ +"""Contain functions for building Airflow configuration.""" from typing import Dict from mwaa.config.database import get_db_connection_string @@ -6,8 +7,9 @@ def get_airflow_db_config() -> Dict[str, str]: """ - Retrieves the environment variables required to set the necessary Airflow - configurations under the "database" section. + Retrieve the environment variables for Airflow's "database" configuration section. + + :returns A dictionary containing the environment variables. """ conn_string = get_db_connection_string() return { @@ -17,11 +19,13 @@ def get_airflow_db_config() -> Dict[str, str]: def get_airflow_celery_config() -> Dict[str, str]: """ - Retrieves the environment variables required to set the necessary Airflow - configurations for using Celery (mostly under the "celery" section, but - other sections as well.) - """ + Retrieve the environment variables required for Celery executor. + + The required environment variables are mostly under the "celery" section, but + other sections as well. + :returns A dictionary containing the environment variables. + """ celery_config_module_path = "mwaa.config.celery.MWAA_CELERY_CONFIG" return { @@ -38,8 +42,9 @@ def get_airflow_celery_config() -> Dict[str, str]: def get_airflow_core_config() -> Dict[str, str]: """ - Retrieves the environment variables required to set the necessary Airflow - configurations under the "core" section. + Retrieve the environment variables for Airflow's "core" configuration section. + + :returns A dictionary containing the environment variables. """ return { "AIRFLOW__CORE__LOAD_EXAMPLES": "False", @@ -48,8 +53,9 @@ def get_airflow_core_config() -> Dict[str, str]: def get_airflow_config() -> Dict[str, str]: """ - Retrieves the environment variables required to set the necessary Airflow - configurations. + Retrieve the environment variables required to set Airflow configurations. 
+ + :returns A dictionary containing the environment variables. """ return { **get_airflow_core_config(), diff --git a/images/airflow/2.8.0/python/mwaa/config/aws.py b/images/airflow/2.8.0/python/mwaa/config/aws.py index 32b5c94..eb0517d 100644 --- a/images/airflow/2.8.0/python/mwaa/config/aws.py +++ b/images/airflow/2.8.0/python/mwaa/config/aws.py @@ -1,11 +1,17 @@ +"""Contain a function for retrieving AWS-related configuration.""" import os def get_aws_region() -> str: """ - Retrieves the AWS region the container should communicate with. This is - assumed to be available in either the AWS_REGION or AWS_DEFAULT_REGION + Retrieve the AWS region the container should communicate with. + + This is assumed to be available in either the AWS_REGION or AWS_DEFAULT_REGION environment variables, checked respectively. + + :returns The AWS region + + :raises RuntimeError if no environment variable for the region is available. """ region = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION") if region: diff --git a/images/airflow/2.8.0/python/mwaa/config/celery.py b/images/airflow/2.8.0/python/mwaa/config/celery.py index c67a805..33add1e 100644 --- a/images/airflow/2.8.0/python/mwaa/config/celery.py +++ b/images/airflow/2.8.0/python/mwaa/config/celery.py @@ -1,3 +1,4 @@ +"""Contain functions for retrieving Airflow Celery-related configuration.""" # Python imports import copy from typing import Any @@ -12,10 +13,12 @@ def create_celery_config() -> dict[str, Any]: """ - Generate the configuration that will be passed to Celery. This is used in - the "celery" section of the Airflow configuration. - """ + Generate the configuration that will be passed to Celery. + + This is used in the "celery" section of the Airflow configuration. + :returns A dictionary containing the Celery configuration. + """ # We use Airflow's default condfiguration and make the changes we want. celery_config: dict[str, Any] = copy.deepcopy(DEFAULT_CELERY_CONFIG) celery_config = { diff --git a/images/airflow/2.8.0/python/mwaa/config/database.py b/images/airflow/2.8.0/python/mwaa/config/database.py index fd4faee..adad460 100644 --- a/images/airflow/2.8.0/python/mwaa/config/database.py +++ b/images/airflow/2.8.0/python/mwaa/config/database.py @@ -1,3 +1,4 @@ +"""Contain functions for retrieving Airflow database-related configuration.""" import os import json from operator import itemgetter @@ -6,7 +7,7 @@ def get_db_credentials() -> Tuple[str, str]: """ - Retrieves database credentials from environment variables. + Retrieve database credentials from environment variables. This function looks for database credentials in two possible locations within the environment variables: @@ -23,23 +24,12 @@ def get_db_credentials() -> Tuple[str, str]: If neither method finds the credentials, a RuntimeError is raised indicating the absence of necessary environment variables for database connection. - Returns: - Tuple[str, str]: A tuple containing the PostgreSQL username and password. + :returns Tuple[str, str]: A tuple containing the PostgreSQL username and password. - Raises: - RuntimeError: If neither MWAA__DB__CREDENTIALS nor MWAA__DB__POSTGRES_USER and - MWAA__DB__POSTGRES_PASSWORD environment variables are set, indicating that the - database credentials are not provided. - - Example: - To use this function, ensure that the required environment variables are set in - your environment before calling it. 
Then, you can retrieve the credentials as - follows: - - >>> user, password = get_db_credentials() - >>> print(f"Username: {user}, Password: {password}") + :raises RuntimeError If neither MWAA__DB__CREDENTIALS nor MWAA__DB__POSTGRES_USER + and MWAA__DB__POSTGRES_PASSWORD environment variables are set, indicating that the + database credentials are not provided. """ - if "MWAA__DB__CREDENTIALS" in os.environ: print("Reading database credentilas from MWAA__DB__CREDENTIALS.") db_secrets = json.loads(os.environ["MWAA__DB__CREDENTIALS"]) @@ -67,10 +57,12 @@ def get_db_credentials() -> Tuple[str, str]: def get_db_connection_string() -> str: """ - Retrieves the connection string to use for communicating with metadata - database. - """ + Retrieve the connection string for communicating with metadata database. + + :returns The connection string. + :raises RuntimeError if the required environment variables are not set. + """ env_vars_names = [ "MWAA__DB__POSTGRES_HOST", "MWAA__DB__POSTGRES_PORT", diff --git a/images/airflow/2.8.0/python/mwaa/config/sqs.py b/images/airflow/2.8.0/python/mwaa/config/sqs.py index 926b7bf..f022745 100644 --- a/images/airflow/2.8.0/python/mwaa/config/sqs.py +++ b/images/airflow/2.8.0/python/mwaa/config/sqs.py @@ -1,3 +1,4 @@ +"""Contain functions for retrieving Airflow SQS-related configuration.""" # Python imports import os from urllib.parse import urlparse, urlunparse @@ -11,14 +12,14 @@ def _change_protocol_to_sqs(url: str) -> str: """ - Make the given SQS endpoint Celery friendly by setting the URL protocol - to sqs://. + Make the given SQS endpoint Celery-friendly by setting the URL protocol to sqs://. Notice that there is no such thing as SQS protocol, but this is the URL convention that Celery uses to understand that the given URL is for an SQS queue. - """ + :returns The Celery-friendly SQS endpoint. + """ parsed_url = urlparse(url) # Check if the scheme was missing and was defaulted to 'http' @@ -40,9 +41,10 @@ def _change_protocol_to_sqs(url: str) -> str: def get_sqs_default_endpoint() -> str: """ - Retrieves the default SQS endpoint for the current AWS region. - """ + Retrieve the default SQS endpoint for the current AWS region. + :returns The endpoint. + """ # Create a session with the specified region session = boto3.Session(region_name=get_aws_region()) @@ -55,10 +57,13 @@ def get_sqs_default_endpoint() -> str: def get_sqs_endpoint() -> str: """ - Retrieves the SQS endpoint to communicate with. The user can specify the - endpoint via the optional `MWAA_CONFIG__CUSTOM_SQS_ENDPOINT` environment - variable. Otherwise, the default endpoint for the current AWS region is + Retrieve the SQS endpoint to communicate with. + + The user can specify the endpoint via the `MWAA_CONFIG__CUSTOM_SQS_ENDPOINT` + environment variable. Otherwise, the default endpoint for the current AWS region is used. + + :returns The SQS endpoint. """ return _change_protocol_to_sqs( os.environ.get("MWAA__SQS__CUSTOM_ENDPOINT") or get_sqs_default_endpoint() @@ -67,11 +72,11 @@ def get_sqs_endpoint() -> str: def _get_queue_name_from_url(queue_url: str) -> str: """ - Extracts the queue name from an Amazon SQS queue URL. + Extract the queue name from an Amazon SQS queue URL. :param queue_url: The URL of the SQS queue. - :return: The name of the queue or None if the URL is invalid. + :returns The name of the queue or None if the URL is invalid. """ try: # Validate the protocol. 
@@ -92,7 +97,9 @@ def _get_queue_name_from_url(queue_url: str) -> str: def get_sqs_queue_url() -> str: """ - Retrieves the URL of the SQS queue specified for use with Celery. + Retrieve the URL of the SQS queue specified for use with Celery. + + :returns The queue URL. """ env_var_name = "MWAA__SQS__QUEUE_URL" if env_var_name not in os.environ: @@ -105,7 +112,9 @@ def get_sqs_queue_url() -> str: def get_sqs_queue_name() -> str: """ - Retrieves the name of the SQS queue specified for use with Celery. + Retrieve the name of the SQS queue specified for use with Celery. + + :returns The queue name. """ return _get_queue_name_from_url(get_sqs_queue_url()) @@ -121,9 +130,11 @@ def should_create_queue() -> bool: def should_use_ssl() -> bool: """ - Determines whether to use SSL when communicating with SQS or not. This - configuration is expected to be true when connecting to AWS SQS, and false - when connecting to elasticmq. + Determine whether to use SSL when communicating with SQS or not. + + This configuration is expected to be true when connecting to AWS SQS, as it enforces + the use of SQS. On the otherhand, when using elasticmq, which doesn't support SSL, + this should be set to false. :return: True or False. """ diff --git a/images/airflow/2.8.0/python/mwaa/entrypoint.py b/images/airflow/2.8.0/python/mwaa/entrypoint.py index 1172630..922d45f 100644 --- a/images/airflow/2.8.0/python/mwaa/entrypoint.py +++ b/images/airflow/2.8.0/python/mwaa/entrypoint.py @@ -29,7 +29,13 @@ ) -def abort(err_msg: str, exit_code: int = 1) -> None: +def abort(err_msg: str, exit_code: int = 1): + """ + Print an error message and then exit the process with the given exit code. + + :param err_msg: The error message to print before exiting. + :param exit_code: The exit code. + """ print(err_msg) sys.exit(exit_code) @@ -44,15 +50,27 @@ def abort(err_msg: str, exit_code: int = 1) -> None: ] -def verify_versions() -> None: - major, minor, micro, *_ = sys.version_info - assert os.environ["PYTHON_VERSION"] == f"{major}.{minor}.{micro}" - - F = TypeVar("F", bound=Callable[..., Any]) def db_lock(lock_id: int, timeout: int = 300 * 1000) -> Callable[[F], F]: + """ + Generate a decorator that can be used to protect a function by a database lock. + + This is useful when a function needs to be protected against multiple simultaneous + executions. For example, during Airflow database initialization, we want to make + sure that only one process is doing it. Since normal lock mechanisms only apply to + the same process, a database lock becomes a viable solution. + + :param lock_id: A unique ID for the lock. When multiple processes try to use the + same lock ID, only one process will be granted the lock at one time. However, + if the processes have different lock IDs, they will be granted the locks at the + same time. + :param timeout: The maximum time the process is allowed to hold the lock. After this + time expires, the lock is automatically released. + + :returns A decorator that can be applied to a function to protect it with a DB lock. 
+ """ def decorator(func: F) -> F: def wrapper(*args: Any, **kwargs: Any) -> Any: func_name: str = func.__name__ @@ -60,12 +78,14 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: get_db_connection_string() # Assuming this is defined elsewhere ) print(f"Obtaining lock for {func_name}...") - with db_engine.connect() as conn: + with db_engine.connect() as conn: # type: ignore try: - conn.execute( + conn.execute( # type: ignore text("SET LOCK_TIMEOUT to :timeout"), {"timeout": timeout} ) - conn.execute(text("SELECT pg_advisory_lock(:id)"), {"id": lock_id}) + conn.execute( # type: ignore + text("SELECT pg_advisory_lock(:id)"), {"id": lock_id} + ) print(f"Obtained lock for {func_name}.") try: @@ -80,8 +100,10 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: ) finally: print(f"Releasing lock for {func_name}...") - conn.execute(text("SET LOCK_TIMEOUT TO DEFAULT")) - conn.execute( + conn.execute( # type: ignore + text("SET LOCK_TIMEOUT TO DEFAULT") + ) + conn.execute( # type: ignore text("SELECT pg_advisory_unlock(:id)"), {"id": lock_id} ) print(f"Released lock for {func_name}") @@ -92,7 +114,19 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: @db_lock(1234) -def airflow_db_init(environ: dict[str, str]) -> None: +def airflow_db_init(environ: dict[str, str]): + """ + Initialize Airflow database. + + Before Airflow can be used, a call to `airflow db migrate` must be done. This + function does this. This function is called in the entrypoint to make sure that, + for any Airflow component, the database is initialized before it starts. + + This function uses a DB lock to make sure that no two processes execute this + function at the same time. + + :param environ: A dictionary containing the environment variables. + """ print("Calling 'airflow db migrate' to initialize the database.") response = subprocess.run( ["airflow db migrate"], shell=True, check=True, text=True, env=environ @@ -103,7 +137,19 @@ def airflow_db_init(environ: dict[str, str]) -> None: @db_lock(5678) -def create_www_user(environ: dict[str, str]) -> None: +def create_airflow_user(environ: dict[str, str]): + """ + Create the 'airflow' user. + + To be able to login to the webserver, you need a user. This function creates a user + with default credentials. + + Notice that this should only be used in development context. In production, other + means need to be employed to create users with strong passwords. Alternatively, with + MWAA setup, a plugin is employed to integrate with IAM (not implemented yet.) + + :param environ: A dictionary containing the environment variables. + """ print("Calling 'airflow users create' to create the webserver user.") response = subprocess.run( " ".join( @@ -137,6 +183,13 @@ def create_www_user(environ: dict[str, str]) -> None: @db_lock(1357) def create_queue() -> None: + """ + Create the SQS required by Celery. + + In our setup, we use SQS as the backend for Celery. Usually, this should be created + before hand. However, sometimes you might want to create the SQS queue during + startup. One such example is when using the elasticmq server as a mock SQS server. + """ if not should_create_queue(): return queue_name = get_sqs_queue_name() @@ -160,7 +213,18 @@ def create_queue() -> None: raise e -def install_user_requirements(environ: dict[str, str]) -> None: +def install_user_requirements(environ: dict[str, str]): + """ + Install user requirements. 
+ + User requirements should be placed in a requirements.txt file and the environment + variable `MWAA__CORE__REQUIREMENTS_PATH` should be set to the location of that file. + In a Docker Compose setup, you would usually want to create a volume that maps a + requirements.txt file in the host machine somewhere in the container, and then set + the `MWAA__CORE__REQUIREMENTS_PATH` accordingly. + + :param environ: A dictionary containing the environment variables. + """ requirements_file = environ.get("MWAA__CORE__REQUIREMENTS_PATH") print(f"MWAA__CORE__REQUIREMENTS_PATH = {requirements_file}") if requirements_file and os.path.isfile(requirements_file): @@ -177,7 +241,19 @@ def install_user_requirements(environ: dict[str, str]) -> None: print("No user requirements to install.") -def export_env_variables(environ: dict[str, str]) -> None: +def export_env_variables(environ: dict[str, str]): + """ + Export the environment variables to .bashrc and .bash_profile. + + For Aiflow to function properly, a bunch of enviornment variables needs to be + defined, which we do in the entrypoint. However, during development, a need might + arise for bashing into the Docker container and doing some debugging, e.g. running + a bunch of Airflow CLI commands. This won't be possible if the necessary environment + variables are not defined, which is the case unless we have them defined in the + .bashrc/.bash_profile files. This function does exactly that. + + :param environ: A dictionary containing the environment variables to export. + """ # Get the home directory of the current user home_dir = os.path.expanduser("~") bashrc_path = os.path.join(home_dir, ".bashrc") @@ -200,8 +276,7 @@ def export_env_variables(environ: dict[str, str]) -> None: def main() -> None: - """Entrypoint of the script.""" - + """Start execution of the script.""" try: ( _, @@ -225,7 +300,7 @@ def main() -> None: environ = {**os.environ, **get_airflow_config()} airflow_db_init(environ) - create_www_user(environ) + create_airflow_user(environ) create_queue() install_user_requirements(environ) diff --git a/images/airflow/2.8.0/requirements.txt b/images/airflow/2.8.0/requirements.txt index 5274e67..e4d7fea 100644 --- a/images/airflow/2.8.0/requirements.txt +++ b/images/airflow/2.8.0/requirements.txt @@ -2,16 +2,198 @@ # building and developing the Airflow image. It is not to be confused with # the requirements of Airflow within the image. Still, they are largely similar # apart from some additional requirements for type checking, e.g. boto3-stubs -# or similar stuff for aiding build and development. +# or similar stuff for aiding build and development, e.g. pydocstyle. 
--constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.7.2/constraints-3.11.txt -apache-airflow-providers-amazon[aiobotocore]==8.7.1 -apache-airflow[celery,statsd]==2.7.2 -boto3 -boto3-stubs[essential] -celery[sqs] -psycopg2 -pycurl -pyright -ruff -sqlalchemy-stubs -watchtower \ No newline at end of file +aiobotocore==2.6.0 +aiohttp==3.8.6 +aioitertools==0.11.0 +aiosignal==1.3.1 +alembic==1.12.0 +amqp==5.1.1 +annotated-types==0.6.0 +anyio==4.0.0 +apache-airflow==2.7.2 +apache-airflow-providers-amazon==8.7.1 +apache-airflow-providers-celery==3.3.4 +apache-airflow-providers-common-sql==1.7.2 +apache-airflow-providers-ftp==3.5.2 +apache-airflow-providers-http==4.5.2 +apache-airflow-providers-imap==3.3.2 +apache-airflow-providers-sqlite==3.4.3 +apispec==6.3.0 +argcomplete==3.1.2 +asgiref==3.7.2 +asn1crypto==1.5.1 +async-timeout==4.0.3 +attrs==23.1.0 +Babel==2.13.0 +backoff==1.10.0 +beautifulsoup4==4.12.2 +billiard==4.1.0 +blinker==1.6.3 +boto3==1.28.17 +boto3-stubs==1.28.85 +botocore==1.31.17 +botocore-stubs==1.34.41 +cachelib==0.9.0 +cattrs==23.1.2 +celery==5.3.4 +certifi==2023.7.22 +cffi==1.16.0 +charset-normalizer==3.3.0 +click==8.1.7 +click-didyoumean==0.3.0 +click-plugins==1.1.1 +click-repl==0.3.0 +clickclick==20.10.2 +colorama==0.4.6 +colorlog==4.8.0 +ConfigUpdater==3.1.1 +connexion==2.14.2 +cron-descriptor==1.4.0 +croniter==1.4.1 +cryptography==41.0.4 +Deprecated==1.2.14 +dill==0.3.1.1 +dnspython==2.4.2 +docutils==0.20.1 +email-validator==1.3.1 +Flask==2.2.5 +Flask-AppBuilder==4.3.6 +Flask-Babel==2.0.0 +Flask-Caching==2.0.2 +Flask-JWT-Extended==4.5.3 +Flask-Limiter==3.5.0 +Flask-Login==0.6.2 +Flask-Session==0.5.0 +Flask-SQLAlchemy==2.5.1 +Flask-WTF==1.2.1 +flower==2.0.1 +frozenlist==1.4.0 +fsspec==2023.12.2 +google-re2==1.1 +googleapis-common-protos==1.60.0 +graphviz==0.20.1 +greenlet==3.0.0 +grpcio==1.59.0 +gunicorn==21.2.0 +h11==0.14.0 +httpcore==0.16.3 +httpx==0.23.3 +humanize==4.8.0 +idna==3.4 +importlib-metadata==6.8.0 +importlib-resources==6.1.0 +inflection==0.5.1 +itsdangerous==2.1.2 +Jinja2==3.1.2 +jmespath==0.10.0 +jsonpath-ng==1.6.0 +jsonschema==4.19.1 +jsonschema-specifications==2023.7.1 +kombu==5.3.2 +lazy-object-proxy==1.9.0 +limits==3.6.0 +linkify-it-py==2.0.2 +lockfile==0.12.2 +lxml==4.9.3 +Mako==1.2.4 +Markdown==3.5 +markdown-it-py==3.0.0 +MarkupSafe==2.1.3 +marshmallow==3.20.1 +marshmallow-oneofschema==3.0.1 +marshmallow-sqlalchemy==0.26.1 +mdit-py-plugins==0.4.0 +mdurl==0.1.2 +multidict==6.0.4 +mypy==1.2.0 +mypy-boto3-cloudformation==1.28.83 +mypy-boto3-dynamodb==1.28.73 +mypy-boto3-ec2==1.28.85 +mypy-boto3-lambda==1.28.83 +mypy-boto3-rds==1.28.61 +mypy-boto3-s3==1.28.55 +mypy-boto3-sqs==1.28.82 +mypy-extensions==1.0.0 +nodeenv==1.8.0 +opentelemetry-api==1.20.0 +opentelemetry-exporter-otlp==1.20.0 +opentelemetry-exporter-otlp-proto-common==1.20.0 +opentelemetry-exporter-otlp-proto-grpc==1.20.0 +opentelemetry-exporter-otlp-proto-http==1.20.0 +opentelemetry-proto==1.20.0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 +ordered-set==4.1.0 +packaging==23.2 +pathspec==0.11.2 +pendulum==2.1.2 +pluggy==1.3.0 +ply==3.11 +prison==0.2.1 +prometheus-client==0.17.1 +prompt-toolkit==3.0.39 +protobuf==4.21.12 +psutil==5.9.5 +psycopg2==2.9.9 +pycparser==2.21 +pycurl==7.45.3 +pydantic==2.4.2 +pydantic_core==2.10.1 +pydocstyle==6.3.0 +Pygments==2.16.1 +PyJWT==2.8.0 +pyright==1.1.351 +python-daemon==3.0.1 +python-dateutil==2.8.2 +python-nvd3==0.15.0 +python-slugify==8.0.1 +pytz==2023.3.post1 +pytzdata==2020.1 +PyYAML==6.0.1 
+redshift-connector==2.0.914
+referencing==0.30.2
+requests==2.31.0
+requests-toolbelt==1.0.0
+rfc3339-validator==0.1.4
+rfc3986==1.5.0
+rich==13.6.0
+rich-argparse==1.3.0
+rpds-py==0.10.4
+ruff==0.0.292
+s3transfer==0.6.2
+scramp==1.4.4
+setproctitle==1.3.3
+six==1.16.0
+sniffio==1.3.0
+snowballstemmer==2.2.0
+soupsieve==2.5
+SQLAlchemy==1.4.49
+SQLAlchemy-JSONField==1.0.1.post0
+sqlalchemy-redshift==0.8.14
+sqlalchemy-stubs==0.4
+SQLAlchemy-Utils==0.41.1
+sqlparse==0.4.4
+statsd==4.0.1
+tabulate==0.9.0
+tenacity==8.2.3
+termcolor==2.3.0
+text-unidecode==1.3
+tornado==6.3.3
+types-awscrt==0.20.3
+types-s3transfer==0.10.0
+typing_extensions==4.8.0
+tzdata==2023.3
+uc-micro-py==1.0.2
+unicodecsv==0.14.1
+universal-pathlib==0.1.4
+urllib3==1.26.17
+vine==5.0.0
+watchtower==2.0.1
+wcwidth==0.2.8
+Werkzeug==2.2.3
+wrapt==1.15.0
+WTForms==3.0.1
+yarl==1.9.2
+zipp==3.17.0
diff --git a/images/airflow/generate-dockerfiles.py b/images/airflow/generate-dockerfiles.py
index c91ce58..5dae6e2 100644
--- a/images/airflow/generate-dockerfiles.py
+++ b/images/airflow/generate-dockerfiles.py
@@ -192,6 +192,7 @@ def generate_airflow_dockerfiles(image_root_dir: Path):


 def main():
+    """Start execution of the script."""
     for x in Path(__file__).parent.iterdir():
         if not x.is_dir():
             continue
diff --git a/quality-checks/lint_python.sh b/quality-checks/lint_python.sh
index 039440d..d613022 100755
--- a/quality-checks/lint_python.sh
+++ b/quality-checks/lint_python.sh
@@ -10,6 +10,8 @@ if [[ "$PWD" != "$REPO_ROOT" ]]; then
     exit 1
 fi

+status=0
+
 check_dir() {
     local dir=$1 # Directory to work in
     local venv_dir="${dir}/.venv" # virtual environment path
@@ -26,9 +28,11 @@ check_dir() {
     source "${venv_dir}/bin/activate"
     # Run ruff and Pyright
     echo "Running ruff..."
-    ruff "${dir}"
+    ruff "${dir}" || status=1
     echo "Running Pyright..."
-    pyright "${dir}"
+    pyright "${dir}" || status=1
+    echo "Running pydocstyle..."
+    pydocstyle "${dir}" || status=1
     deactivate

     echo
@@ -43,3 +47,5 @@ for image_dir in ./images/airflow/*; do
         check_dir "$image_dir"
     fi
 done
+
+exit $status
\ No newline at end of file
diff --git a/quality-checks/pip_install_check.py b/quality-checks/pip_install_check.py
index d7fbc0e..0a8c213 100755
--- a/quality-checks/pip_install_check.py
+++ b/quality-checks/pip_install_check.py
@@ -1,4 +1,11 @@
 #!/bin/python3
+"""
+This module verifies there is no direct use of "pip install" in the code.
+
+Direct use of "pip install" could easily result in broken Airflow dependencies. As such,
+we always want to use a special script, safe-pip-install, which ensures Airflow and its
+dependencies are protected.
+"""
 import os
 import sys

@@ -9,7 +16,7 @@

 def check_file_for_pip_install(filepath: str) -> bool:
     """
-    Checks if the file contains 'pip install'.
+    Check if the file contains 'pip install'.

     :param filepath: The path of the file to check.

@@ -24,8 +31,7 @@ def check_file_for_pip_install(filepath: str) -> bool:
 def verify_no_pip_install(directory: str) -> bool:
     """
-    Recursively searches through the directory tree and verifies that there
-    are no direct use of `pip install`.
+    Verify there is no direct use of `pip install` in the directory tree.

     :param directory: The directory to scan.

     :returns True if the verification succeeds, otherwise False.
@@ -52,10 +58,7 @@ def verify_no_pip_install(directory: str) -> bool:


 def verify_in_repo_root() -> None:
-    """
-    Verifies that the script is being executed from within the repository
-    root. Exits with non-zero code if that's not the case.
-    """
+    """Verify the script is executed from the repository root, or exit with non-zero."""
     # Determine the script's directory and the parent directory (which should
     # be )
     script_dir = os.path.dirname(os.path.realpath(__file__))
@@ -72,6 +75,7 @@ def verify_in_repo_root() -> None:


 def main() -> None:
+    """Start execution of the script."""
     verify_in_repo_root()

     if verify_no_pip_install("./"):
diff --git a/quality-checks/run_all.py b/quality-checks/run_all.py
index ff6b7fc..b92f64e 100755
--- a/quality-checks/run_all.py
+++ b/quality-checks/run_all.py
@@ -1,4 +1,5 @@
 #!/bin/python3
+"""Run all quality check scripts under the quality-checks/ folder."""
 import os
 import subprocess
 import sys
@@ -11,9 +12,7 @@
 # TODO Remove support of Amazon Linux 2 from this package as soon as possible, as we
 # shouldn't be relying on an EOLed Python version.
 def prefix_output(file: str, process: subprocess.Popen) -> None:  # type: ignore
-    """
-    Prefix each line of output with the filename.
-    """
+    """Prefix each line of output with the filename."""
     if not process.stdout:  # type: ignore
         raise RuntimeError("Process doesn't have an stdout stream.")
     for line in process.stdout:  # type: ignore
@@ -21,10 +20,7 @@ def prefix_output(file: str, process: subprocess.Popen) -> None:  # type: ignore


 def verify_in_repo_root() -> None:
-    """
-    Verifies that the script is being executed from within the repository
-    root. Exits with non-zero code if that's not the case.
-    """
+    """Verify the script is executed from the repository root, or exit with non-zero."""
     # Determine the script's directory and the parent directory (which should
     # be )
     script_dir = os.path.dirname(os.path.realpath(__file__))
@@ -41,9 +37,7 @@ def verify_in_repo_root() -> None:


 def main() -> None:
-    """
-    Script entrypoint.
-    """
+    """Start execution of the script."""
     verify_in_repo_root()

     quality_checks_dir = "./quality-checks/"
diff --git a/requirements.txt b/requirements.txt
index 511709c..5186cf1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,10 @@
 Jinja2==3.1.3
 MarkupSafe==2.1.5
 nodeenv==1.8.0
-pyright==1.1.349
+pip==24.0
+pydocstyle==6.3.0
+pyright==1.1.351
 ruff==0.2.0
 setuptools==68.2.2
-wheel==0.41.2
\ No newline at end of file
+snowballstemmer==2.2.0
+wheel==0.41.2
diff --git a/ruff.toml b/ruff.toml
new file mode 100644
index 0000000..216355c
--- /dev/null
+++ b/ruff.toml
@@ -0,0 +1,4 @@
+exclude = [
+    ".venv",
+    "images"
+]
\ No newline at end of file

From dcb7582825ad74dc11a8db0923305cf58ddbd5b5 Mon Sep 17 00:00:00 2001
From: Rafid K
Date: Fri, 19 Apr 2024 17:00:48 +0000
Subject: [PATCH 11/11] Addressing feedback on PR #41 + other minor changes

* Checked for the major version in `verify_python_version`.
* More documentation in `generate_base_dockerfile`.
* Bumped version to 2.9.0.
* Supported passing the SSL mode for the Postgres connection (a condensed
  sketch follows at the end of this list).
* Downgraded to Python 3.11.9 since we don't want to go to Python 3.12 before
  sufficient adoption.
* Removed version pinning for Amazon providers since this is covered by the
  Airflow constraints file.
* Updated the `requirements.txt` used for development. Removed all but the
  requirements we want, and left the rest for pip to install automatically.
  This makes updating the file easier.
* `db_lock` method: renamed `timeout` to `timeout_ms` for clarity.
* Checked for both `pip install` and `pip3 install` in `pip_install_check.py`.
* Supported an allowlist in `pip_install_check.py` in case some scripts need
  to use `pip install` directly, e.g. the script to install Python, since it
  needs to update `pip` (also sketched at the end of this list).
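
For context, here is a condensed sketch (not the actual `mwaa.config.database`
module) of how `MWAA__DB__POSTGRES_SSLMODE` ends up in the SQLAlchemy connection
string. The helper name below is illustrative only, and the real code also
validates that all of the variables are present and aborts with a clear error
instead of raising `KeyError`:

    import os

    def sketch_db_connection_string() -> str:
        env = os.environ
        # An empty SSL mode falls back to 'require'; docker-compose.yaml sets
        # the variable to 'prefer' for local development.
        sslmode = env.get("MWAA__DB__POSTGRES_SSLMODE") or "require"
        user = env["MWAA__DB__POSTGRES_USER"]
        password = env["MWAA__DB__POSTGRES_PASSWORD"]
        host = env["MWAA__DB__POSTGRES_HOST"]
        port = env["MWAA__DB__POSTGRES_PORT"]
        db = env["MWAA__DB__POSTGRES_DB"]
        return f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}?sslmode={sslmode}"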
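
Likewise, a minimal sketch of how the allowlist is consulted, relying only on
standard `pathlib.Path.match` glob semantics (each `*` matches a single path
component); the bootstrap script path used in the example is hypothetical:

    from pathlib import Path

    # Mirrors the single pattern added to PIP_INSTALL_ALLOWLIST in pip_install_check.py.
    ALLOWLIST = ["images/airflow/*/bootstrap/*/*-install-python.sh"]

    def is_allowlisted(script: Path) -> bool:
        # A shell script may call `pip install` directly only if it matches one
        # of the allowlisted patterns.
        return any(script.match(pattern) for pattern in ALLOWLIST)

    # Hypothetical bootstrap script path, matched component by component.
    script = Path("images/airflow/2.8.0/bootstrap/01-root-firstpass/100-install-python.sh")
    print(is_allowlisted(script))  # True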
--- .github/workflows/quality-checks.yaml | 12 +- create_venvs.py | 7 +- images/airflow/2.8.0/Dockerfile.base.j2 | 11 +- .../airflow/2.8.0/Dockerfile.derivatives.j2 | 2 +- images/airflow/2.8.0/Dockerfiles/Dockerfile | 4 +- .../airflow/2.8.0/Dockerfiles/Dockerfile-dev | 4 +- .../2.8.0/Dockerfiles/Dockerfile-explorer | 4 +- .../2.8.0/Dockerfiles/Dockerfile-explorer-dev | 4 +- .../Dockerfile-explorer-privileged | 4 +- .../Dockerfile-explorer-privileged-dev | 4 +- .../airflow/2.8.0/Dockerfiles/Dockerfile.base | 13 +- .../2.8.0/bin/airflow-user/safe-pip-install | 2 +- images/airflow/2.8.0/build.sh | 4 +- images/airflow/2.8.0/docker-compose.yaml | 3 +- images/airflow/2.8.0/explore-image.sh | 2 +- images/airflow/2.8.0/install_pip_packages.sh | 16 -- .../2.8.0/python/mwaa/config/airflow.py | 2 +- .../2.8.0/python/mwaa/config/database.py | 7 +- .../airflow/2.8.0/python/mwaa/config/sqs.py | 4 +- .../airflow/2.8.0/python/mwaa/entrypoint.py | 12 +- images/airflow/2.8.0/requirements.txt | 215 ++---------------- images/airflow/2.8.0/run.sh.template | 2 +- images/airflow/generate-dockerfiles.py | 12 +- quality-checks/pip_install_check.py | 39 ++-- 24 files changed, 111 insertions(+), 278 deletions(-) delete mode 100755 images/airflow/2.8.0/install_pip_packages.sh diff --git a/.github/workflows/quality-checks.yaml b/.github/workflows/quality-checks.yaml index 35b721c..9e00815 100644 --- a/.github/workflows/quality-checks.yaml +++ b/.github/workflows/quality-checks.yaml @@ -17,8 +17,8 @@ jobs: # gcc, libcurl-devel: For compiling pycurl (required for our Airflow setup.) # gzip: Requiring by actions/checkout@v2 to gunzip the source code. # postgresql-devel: Required for our Airflow setup. - # python3-devel: Required for building some Python modules, e.g. pycurl. - # python3: Self explanatory. + # python3.11-devel: Required for building some Python modules, e.g. pycurl. + # python3.11: Self explanatory. # tar, wget, xz: For downloading and extracting ShellCheck dnf update -y dnf install -y \ @@ -26,8 +26,8 @@ jobs: gzip \ libcurl-devel \ postgresql-devel \ - python3 \ - python3-devel \ + python3.11 \ + python3.11-devel \ tar \ wget \ xz @@ -40,7 +40,7 @@ jobs: uses: actions/checkout@v2 - name: Create the necessary Python virtual environments... - run: python3 ./create_venvs.py + run: python3.11 ./create_venvs.py - name: Run quality checks... - run: python3 ./quality-checks/run_all.py + run: python3.11 ./quality-checks/run_all.py diff --git a/create_venvs.py b/create_venvs.py index f128151..07d8008 100644 --- a/create_venvs.py +++ b/create_venvs.py @@ -21,9 +21,10 @@ def verify_python_version(): """Check if the current Python version is at least 3.9.""" - _major, minor, *_ = sys.version_info - if minor < 9: - print("Python 3.9 or higher is required.") + major, minor, *_ = sys.version_info + + if major != 3 or minor < 11: + print("Python 3.11 or higher is required.") sys.exit(1) diff --git a/images/airflow/2.8.0/Dockerfile.base.j2 b/images/airflow/2.8.0/Dockerfile.base.j2 index 0cbd167..f053fe9 100644 --- a/images/airflow/2.8.0/Dockerfile.base.j2 +++ b/images/airflow/2.8.0/Dockerfile.base.j2 @@ -2,23 +2,20 @@ FROM public.ecr.aws/amazonlinux/amazonlinux:2023 # Environment variables -# Temporarily downgrading to 2.7.2 to make it easier to test the Docker image -# within Amazon MWAA since 2.7.2 is a version we support. 
-ENV AIRFLOW_VERSION=2.7.2 -ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.7.1 +ENV AIRFLOW_VERSION=2.9.0 -ENV AIRFLOW_CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-2.7.2/constraints-3.11.txt" +ENV AIRFLOW_CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-2.9.0/constraints-3.11.txt" ENV AIRFLOW_USER_HOME=/usr/local/airflow ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} ENV MWAA_HOME=/usr/local/mwaa -ENV PYTHON_VERSION=3.11.7 +ENV PYTHON_VERSION=3.11.9 # We don't want those variables to stay in the final image, so we use ARG instead of ENV. ARG AIRFLOW_USER_LOCAL_PATH=${AIRFLOW_USER_HOME}/.local ARG AIRFLOW_USER_LOCAL_BIN_PATH=${AIRFLOW_USER_LOCAL_PATH}/bin ARG PATH_DEFAULT=${PATH} ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_LOCAL_BIN_PATH}:${PATH_DEFAULT} -ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c +ARG PYTHON_MD5_CHECKSUM=22ea467e7d915477152e99d5da856ddc ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 diff --git a/images/airflow/2.8.0/Dockerfile.derivatives.j2 b/images/airflow/2.8.0/Dockerfile.derivatives.j2 index b499484..5999882 100644 --- a/images/airflow/2.8.0/Dockerfile.derivatives.j2 +++ b/images/airflow/2.8.0/Dockerfile.derivatives.j2 @@ -1,4 +1,4 @@ -FROM amazon-mwaa/airflow:2.8.0-base +FROM amazon-mwaa/airflow:2.9.0-base {% if bootstrapping_scripts_dev %} diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile b/images/airflow/2.8.0/Dockerfiles/Dockerfile index ee70bfa..ac1909d 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile @@ -3,10 +3,10 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-22 19:42:44.935774 +# This file was generated on 2024-04-19 02:30:48.359587 # -FROM amazon-mwaa/airflow:2.8.0-base +FROM amazon-mwaa/airflow:2.9.0-base USER airflow diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev index 0ae295d..09a91d6 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-dev @@ -3,10 +3,10 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-22 19:42:44.927521 +# This file was generated on 2024-04-19 02:30:48.351336 # -FROM amazon-mwaa/airflow:2.8.0-base +FROM amazon-mwaa/airflow:2.9.0-base # Copy bootstrapping files. COPY ./bootstrap-dev /bootstrap-dev diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer index df2a928..3e65a89 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer @@ -3,10 +3,10 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. 
# -# This file was generated on 2024-02-22 19:42:44.938417 +# This file was generated on 2024-04-19 02:30:48.362267 # -FROM amazon-mwaa/airflow:2.8.0-base +FROM amazon-mwaa/airflow:2.9.0-base USER airflow diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev index a229408..31f0c48 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-dev @@ -3,10 +3,10 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-22 19:42:44.930305 +# This file was generated on 2024-04-19 02:30:48.354110 # -FROM amazon-mwaa/airflow:2.8.0-base +FROM amazon-mwaa/airflow:2.9.0-base # Copy bootstrapping files. COPY ./bootstrap-dev /bootstrap-dev diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged index dfb4196..bd75226 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged @@ -3,10 +3,10 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-22 19:42:44.941098 +# This file was generated on 2024-04-19 02:30:48.364902 # -FROM amazon-mwaa/airflow:2.8.0-base +FROM amazon-mwaa/airflow:2.9.0-base USER root diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev index c4676f4..2fadd2d 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile-explorer-privileged-dev @@ -3,10 +3,10 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-22 19:42:44.933100 +# This file was generated on 2024-04-19 02:30:48.356865 # -FROM amazon-mwaa/airflow:2.8.0-base +FROM amazon-mwaa/airflow:2.9.0-base # Copy bootstrapping files. COPY ./bootstrap-dev /bootstrap-dev diff --git a/images/airflow/2.8.0/Dockerfiles/Dockerfile.base b/images/airflow/2.8.0/Dockerfiles/Dockerfile.base index 095bd05..6bbe400 100644 --- a/images/airflow/2.8.0/Dockerfiles/Dockerfile.base +++ b/images/airflow/2.8.0/Dockerfiles/Dockerfile.base @@ -3,30 +3,27 @@ # the Jinja2-templated Dockerfile.j2 file, so you need to change that file # instead. # -# This file was generated on 2024-02-22 19:42:44.924226 +# This file was generated on 2024-04-19 02:30:48.348008 # FROM public.ecr.aws/amazonlinux/amazonlinux:2023 # Environment variables -# Temporarily downgrading to 2.7.2 to make it easier to test the Docker image -# within Amazon MWAA since 2.7.2 is a version we support. -ENV AIRFLOW_VERSION=2.7.2 -ENV AIRFLOW_AMAZON_PROVIDERS_VERSION=8.7.1 +ENV AIRFLOW_VERSION=2.9.0 -ENV AIRFLOW_CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-2.7.2/constraints-3.11.txt" +ENV AIRFLOW_CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-2.9.0/constraints-3.11.txt" ENV AIRFLOW_USER_HOME=/usr/local/airflow ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} ENV MWAA_HOME=/usr/local/mwaa -ENV PYTHON_VERSION=3.11.7 +ENV PYTHON_VERSION=3.11.9 # We don't want those variables to stay in the final image, so we use ARG instead of ENV. 
ARG AIRFLOW_USER_LOCAL_PATH=${AIRFLOW_USER_HOME}/.local ARG AIRFLOW_USER_LOCAL_BIN_PATH=${AIRFLOW_USER_LOCAL_PATH}/bin ARG PATH_DEFAULT=${PATH} ARG PATH_AIRFLOW_USER=${AIRFLOW_USER_LOCAL_BIN_PATH}:${PATH_DEFAULT} -ARG PYTHON_MD5_CHECKSUM=d96c7e134c35a8c46236f8a0e566b69c +ARG PYTHON_MD5_CHECKSUM=22ea467e7d915477152e99d5da856ddc ARG MARIADB_DOWNLOAD_BASE_URL=https://mirror.mariadb.org/yum/11.1/fedora38-amd64/rpms ARG MARIADB_RPM_COMMON=MariaDB-common-11.1.2-1.fc38.x86_64.rpm ARG MARIADB_RPM_COMMON_CHECKSUM=e87371d558efa97724f3728fb214cf19 diff --git a/images/airflow/2.8.0/bin/airflow-user/safe-pip-install b/images/airflow/2.8.0/bin/airflow-user/safe-pip-install index 7fb324c..8adf703 100644 --- a/images/airflow/2.8.0/bin/airflow-user/safe-pip-install +++ b/images/airflow/2.8.0/bin/airflow-user/safe-pip-install @@ -2,7 +2,7 @@ # Define an array of required packages REQUIRED_PACKAGES=( - "apache-airflow-providers-amazon[aiobotocore]==${AIRFLOW_AMAZON_PROVIDERS_VERSION}" + "apache-airflow-providers-amazon[aiobotocore]" "apache-airflow[celery,statsd]==${AIRFLOW_VERSION}" "celery[sqs]" psycopg2 diff --git a/images/airflow/2.8.0/build.sh b/images/airflow/2.8.0/build.sh index 3e7a8cb..8625377 100755 --- a/images/airflow/2.8.0/build.sh +++ b/images/airflow/2.8.0/build.sh @@ -8,13 +8,13 @@ python3 ../generate-dockerfiles.py deactivate # Build the base image. -docker build -f ./Dockerfiles/Dockerfile.base -t amazon-mwaa/airflow:2.8.0-base ./ +docker build -f ./Dockerfiles/Dockerfile.base -t amazon-mwaa/airflow:2.9.0-base ./ # Build the derivatives. for dev in "True" "False"; do for build_type in "standard" "explorer" "explorer-privileged"; do dockerfile_name="Dockerfile" - tag_name="amazon-mwaa/airflow:2.8.0" + tag_name="amazon-mwaa/airflow:2.9.0" if [[ "$build_type" != "standard" ]]; then dockerfile_name="${dockerfile_name}-${build_type}" diff --git a/images/airflow/2.8.0/docker-compose.yaml b/images/airflow/2.8.0/docker-compose.yaml index 4f0b912..d411111 100644 --- a/images/airflow/2.8.0/docker-compose.yaml +++ b/images/airflow/2.8.0/docker-compose.yaml @@ -1,7 +1,7 @@ version: "3.8" x-airflow-common: &airflow-common - image: amazon-mwaa/airflow:2.8.0 + image: amazon-mwaa/airflow:2.9.0 restart: always environment: # AWS credentials @@ -20,6 +20,7 @@ x-airflow-common: &airflow-common MWAA__DB__POSTGRES_USER: "airflow" MWAA__DB__POSTGRES_PASSWORD: "airflow" MWAA__DB__POSTGRES_DB: "airflow" + MWAA__DB__POSTGRES_SSLMODE: "prefer" # SQS configuration MWAA__SQS__CREATE_QUEUE: True diff --git a/images/airflow/2.8.0/explore-image.sh b/images/airflow/2.8.0/explore-image.sh index 6e316d3..5792fd3 100755 --- a/images/airflow/2.8.0/explore-image.sh +++ b/images/airflow/2.8.0/explore-image.sh @@ -1,2 +1,2 @@ #!/bin/bash -docker container run -it amazon-mwaa/airflow:2.8.0-explorer-dev +docker container run -it amazon-mwaa/airflow:2.9.0-explorer-dev diff --git a/images/airflow/2.8.0/install_pip_packages.sh b/images/airflow/2.8.0/install_pip_packages.sh deleted file mode 100755 index ec8dd18..0000000 --- a/images/airflow/2.8.0/install_pip_packages.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -AIRFLOW_AMAZON_PROVIDERS_VERSION=8.13.0 -AIRFLOW_VERSION=2.8.0 -PYTHON_MAJOR_MINOR_VERSION=3.11 - -CONSTRAINT_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_MAJOR_MINOR_VERSION}.txt" -pip3 install --constraint "${CONSTRAINT_FILE}" \ - autopep8 \ - jinja2 \ - pycurl \ - psycopg2 \ - "celery[sqs]" \ - "apache-airflow[celery,statsd]==${AIRFLOW_VERSION}" \ - 
"apache-airflow-providers-amazon[aiobotocore]==${AIRFLOW_AMAZON_PROVIDERS_VERSION}" \ - watchtower \ No newline at end of file diff --git a/images/airflow/2.8.0/python/mwaa/config/airflow.py b/images/airflow/2.8.0/python/mwaa/config/airflow.py index 3962ebd..2ec0318 100644 --- a/images/airflow/2.8.0/python/mwaa/config/airflow.py +++ b/images/airflow/2.8.0/python/mwaa/config/airflow.py @@ -34,8 +34,8 @@ def get_airflow_celery_config() -> Dict[str, str]: "AIRFLOW__CELERY__CELERY_CONFIG_OPTIONS": celery_config_module_path, "AIRFLOW__CELERY__RESULT_BACKEND": f"db+{get_db_connection_string()}", "AIRFLOW__CELERY__WORKER_ENABLE_REMOTE_CONTROL": "False", - "AIRFLOW__CORE__EXECUTOR": "CeleryExecutor", # Not a Celery config per-se, but is used by the Celery executor. + "AIRFLOW__CORE__EXECUTOR": "CeleryExecutor", "AIRFLOW__OPERATORS__DEFAULT_QUEUE": get_sqs_queue_name(), } diff --git a/images/airflow/2.8.0/python/mwaa/config/database.py b/images/airflow/2.8.0/python/mwaa/config/database.py index adad460..76bdb27 100644 --- a/images/airflow/2.8.0/python/mwaa/config/database.py +++ b/images/airflow/2.8.0/python/mwaa/config/database.py @@ -67,12 +67,14 @@ def get_db_connection_string() -> str: "MWAA__DB__POSTGRES_HOST", "MWAA__DB__POSTGRES_PORT", "MWAA__DB__POSTGRES_DB", + "MWAA__DB__POSTGRES_SSLMODE", ] try: ( postgres_host, postgres_port, postgres_db, + postgres_sslmode, ) = itemgetter(*env_vars_names)(os.environ) (postgres_user, postgres_password) = get_db_credentials() except Exception as e: @@ -84,8 +86,11 @@ def get_db_connection_string() -> str: f"following exception: {e}" ) + if not postgres_sslmode: + postgres_sslmode = 'require' + protocol = "postgresql+psycopg2" creds = f"{postgres_user}:{postgres_password}" addr = f"{postgres_host}:{postgres_port}" # TODO We need to do what is the necessary to enforce 'require'. - return f"{protocol}://{creds}@{addr}/{postgres_db}?sslmode=prefer" + return f"{protocol}://{creds}@{addr}/{postgres_db}?sslmode={postgres_sslmode}" diff --git a/images/airflow/2.8.0/python/mwaa/config/sqs.py b/images/airflow/2.8.0/python/mwaa/config/sqs.py index f022745..b9be186 100644 --- a/images/airflow/2.8.0/python/mwaa/config/sqs.py +++ b/images/airflow/2.8.0/python/mwaa/config/sqs.py @@ -79,7 +79,9 @@ def _get_queue_name_from_url(queue_url: str) -> str: :returns The name of the queue or None if the URL is invalid. """ try: - # Validate the protocol. + # Validate the protocol (to flag accidentally passing of sqs:// + # protocol which is just a Celery convention, rather than an + # actual protocol.) if not queue_url.startswith("http://") and not queue_url.startswith("https://"): raise ValueError( f"URL {queue_url} is should start with http:// or https://" diff --git a/images/airflow/2.8.0/python/mwaa/entrypoint.py b/images/airflow/2.8.0/python/mwaa/entrypoint.py index 922d45f..3c3824c 100644 --- a/images/airflow/2.8.0/python/mwaa/entrypoint.py +++ b/images/airflow/2.8.0/python/mwaa/entrypoint.py @@ -53,7 +53,7 @@ def abort(err_msg: str, exit_code: int = 1): F = TypeVar("F", bound=Callable[..., Any]) -def db_lock(lock_id: int, timeout: int = 300 * 1000) -> Callable[[F], F]: +def db_lock(lock_id: int, timeout_ms: int = 300 * 1000) -> Callable[[F], F]: """ Generate a decorator that can be used to protect a function by a database lock. @@ -66,8 +66,8 @@ def db_lock(lock_id: int, timeout: int = 300 * 1000) -> Callable[[F], F]: same lock ID, only one process will be granted the lock at one time. 
However, if the processes have different lock IDs, they will be granted the locks at the same time. - :param timeout: The maximum time the process is allowed to hold the lock. After this - time expires, the lock is automatically released. + :param timeout_ms: The maximum time, in milliseconds, the process is allowed to hold + the lock. After this time expires, the lock is automatically released. :returns A decorator that can be applied to a function to protect it with a DB lock. """ @@ -75,13 +75,13 @@ def decorator(func: F) -> F: def wrapper(*args: Any, **kwargs: Any) -> Any: func_name: str = func.__name__ db_engine: Engine = create_engine( - get_db_connection_string() # Assuming this is defined elsewhere + get_db_connection_string() ) print(f"Obtaining lock for {func_name}...") with db_engine.connect() as conn: # type: ignore try: conn.execute( # type: ignore - text("SET LOCK_TIMEOUT to :timeout"), {"timeout": timeout} + text("SET LOCK_TIMEOUT to :timeout"), {"timeout": timeout_ms} ) conn.execute( # type: ignore text("SELECT pg_advisory_lock(:id)"), {"id": lock_id} @@ -93,7 +93,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: except Exception as e: abort( f"Failed while executing {func_name}. " + f"Error: {e}." - ) # Assuming abort is defined elsewhere + ) except Exception as e: abort( f"Failed to obtain DB lock for {func_name}. " + f"Error: {e}." diff --git a/images/airflow/2.8.0/requirements.txt b/images/airflow/2.8.0/requirements.txt index e4d7fea..94f498f 100644 --- a/images/airflow/2.8.0/requirements.txt +++ b/images/airflow/2.8.0/requirements.txt @@ -3,197 +3,24 @@ # the requirements of Airflow within the image. Still, they are largely similar # apart from some additional requirements for type checking, e.g. boto3-stubs # or similar stuff for aiding build and development, e.g. pydocstyle. 
---constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.7.2/constraints-3.11.txt -aiobotocore==2.6.0 -aiohttp==3.8.6 -aioitertools==0.11.0 -aiosignal==1.3.1 -alembic==1.12.0 -amqp==5.1.1 -annotated-types==0.6.0 -anyio==4.0.0 -apache-airflow==2.7.2 -apache-airflow-providers-amazon==8.7.1 -apache-airflow-providers-celery==3.3.4 -apache-airflow-providers-common-sql==1.7.2 -apache-airflow-providers-ftp==3.5.2 -apache-airflow-providers-http==4.5.2 -apache-airflow-providers-imap==3.3.2 -apache-airflow-providers-sqlite==3.4.3 -apispec==6.3.0 -argcomplete==3.1.2 -asgiref==3.7.2 -asn1crypto==1.5.1 -async-timeout==4.0.3 -attrs==23.1.0 -Babel==2.13.0 -backoff==1.10.0 -beautifulsoup4==4.12.2 -billiard==4.1.0 -blinker==1.6.3 -boto3==1.28.17 -boto3-stubs==1.28.85 -botocore==1.31.17 -botocore-stubs==1.34.41 -cachelib==0.9.0 -cattrs==23.1.2 -celery==5.3.4 -certifi==2023.7.22 -cffi==1.16.0 -charset-normalizer==3.3.0 -click==8.1.7 -click-didyoumean==0.3.0 -click-plugins==1.1.1 -click-repl==0.3.0 -clickclick==20.10.2 -colorama==0.4.6 -colorlog==4.8.0 -ConfigUpdater==3.1.1 -connexion==2.14.2 -cron-descriptor==1.4.0 -croniter==1.4.1 -cryptography==41.0.4 -Deprecated==1.2.14 -dill==0.3.1.1 -dnspython==2.4.2 -docutils==0.20.1 -email-validator==1.3.1 -Flask==2.2.5 -Flask-AppBuilder==4.3.6 -Flask-Babel==2.0.0 -Flask-Caching==2.0.2 -Flask-JWT-Extended==4.5.3 -Flask-Limiter==3.5.0 -Flask-Login==0.6.2 -Flask-Session==0.5.0 -Flask-SQLAlchemy==2.5.1 -Flask-WTF==1.2.1 -flower==2.0.1 -frozenlist==1.4.0 -fsspec==2023.12.2 -google-re2==1.1 -googleapis-common-protos==1.60.0 -graphviz==0.20.1 -greenlet==3.0.0 -grpcio==1.59.0 -gunicorn==21.2.0 -h11==0.14.0 -httpcore==0.16.3 -httpx==0.23.3 -humanize==4.8.0 -idna==3.4 -importlib-metadata==6.8.0 -importlib-resources==6.1.0 -inflection==0.5.1 -itsdangerous==2.1.2 -Jinja2==3.1.2 -jmespath==0.10.0 -jsonpath-ng==1.6.0 -jsonschema==4.19.1 -jsonschema-specifications==2023.7.1 -kombu==5.3.2 -lazy-object-proxy==1.9.0 -limits==3.6.0 -linkify-it-py==2.0.2 -lockfile==0.12.2 -lxml==4.9.3 -Mako==1.2.4 -Markdown==3.5 -markdown-it-py==3.0.0 -MarkupSafe==2.1.3 -marshmallow==3.20.1 -marshmallow-oneofschema==3.0.1 -marshmallow-sqlalchemy==0.26.1 -mdit-py-plugins==0.4.0 -mdurl==0.1.2 -multidict==6.0.4 -mypy==1.2.0 -mypy-boto3-cloudformation==1.28.83 -mypy-boto3-dynamodb==1.28.73 -mypy-boto3-ec2==1.28.85 -mypy-boto3-lambda==1.28.83 -mypy-boto3-rds==1.28.61 -mypy-boto3-s3==1.28.55 -mypy-boto3-sqs==1.28.82 -mypy-extensions==1.0.0 -nodeenv==1.8.0 -opentelemetry-api==1.20.0 -opentelemetry-exporter-otlp==1.20.0 -opentelemetry-exporter-otlp-proto-common==1.20.0 -opentelemetry-exporter-otlp-proto-grpc==1.20.0 -opentelemetry-exporter-otlp-proto-http==1.20.0 -opentelemetry-proto==1.20.0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 -ordered-set==4.1.0 -packaging==23.2 -pathspec==0.11.2 -pendulum==2.1.2 -pluggy==1.3.0 -ply==3.11 -prison==0.2.1 -prometheus-client==0.17.1 -prompt-toolkit==3.0.39 -protobuf==4.21.12 -psutil==5.9.5 -psycopg2==2.9.9 -pycparser==2.21 -pycurl==7.45.3 -pydantic==2.4.2 -pydantic_core==2.10.1 -pydocstyle==6.3.0 -Pygments==2.16.1 -PyJWT==2.8.0 -pyright==1.1.351 -python-daemon==3.0.1 -python-dateutil==2.8.2 -python-nvd3==0.15.0 -python-slugify==8.0.1 -pytz==2023.3.post1 -pytzdata==2020.1 -PyYAML==6.0.1 -redshift-connector==2.0.914 -referencing==0.30.2 -requests==2.31.0 -requests-toolbelt==1.0.0 -rfc3339-validator==0.1.4 -rfc3986==1.5.0 -rich==13.6.0 -rich-argparse==1.3.0 -rpds-py==0.10.4 -ruff==0.0.292 -s3transfer==0.6.2 -scramp==1.4.4 
-setproctitle==1.3.3
-six==1.16.0
-sniffio==1.3.0
-snowballstemmer==2.2.0
-soupsieve==2.5
-SQLAlchemy==1.4.49
-SQLAlchemy-JSONField==1.0.1.post0
-sqlalchemy-redshift==0.8.14
-sqlalchemy-stubs==0.4
-SQLAlchemy-Utils==0.41.1
-sqlparse==0.4.4
-statsd==4.0.1
-tabulate==0.9.0
-tenacity==8.2.3
-termcolor==2.3.0
-text-unidecode==1.3
-tornado==6.3.3
-types-awscrt==0.20.3
-types-s3transfer==0.10.0
-typing_extensions==4.8.0
-tzdata==2023.3
-uc-micro-py==1.0.2
-unicodecsv==0.14.1
-universal-pathlib==0.1.4
-urllib3==1.26.17
-vine==5.0.0
-watchtower==2.0.1
-wcwidth==0.2.8
-Werkzeug==2.2.3
-wrapt==1.15.0
-WTForms==3.0.1
-yarl==1.9.2
-zipp==3.17.0
+--constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.9.0/constraints-3.11.txt
+# Main Airflow packages.
+# NOTE: We always specify the version here.
+apache-airflow[celery,statsd]==2.9.0
+# Additional packages for MWAA Airflow.
+# NOTE: We don't specify the version here since the constraints file takes care
+# of this, making it easier to port the code to other versions.
+apache-airflow-providers-amazon[aiobotocore]
+celery[sqs]
+psycopg2
+pycurl
+watchtower
+# Additional packages for development.
+# NOTE: Like above, we don't specify the version here.
+boto3-stubs
+botocore-stubs
+jinja2
+pydocstyle
+pyright
+ruff
+sqlalchemy-stubs
diff --git a/images/airflow/2.8.0/run.sh.template b/images/airflow/2.8.0/run.sh.template
index 6126b24..d963756 100644
--- a/images/airflow/2.8.0/run.sh.template
+++ b/images/airflow/2.8.0/run.sh.template
@@ -15,4 +15,4 @@ export MWAA__SQS__QUEUE_URL="The URL of the SQS key to use with Celery>"

 # Build the Docker image
 ./build.sh
-docker compose up
\ No newline at end of file
+docker compose up
diff --git a/images/airflow/generate-dockerfiles.py b/images/airflow/generate-dockerfiles.py
index 5dae6e2..da370ca 100644
--- a/images/airflow/generate-dockerfiles.py
+++ b/images/airflow/generate-dockerfiles.py
@@ -101,7 +101,17 @@ def generate_dockerfile(

 def generate_base_dockerfile(image_root_dir: Path) -> None:
     """Generate the Dockerfile.base file based on the Dockerfile.base.j2
-    template."""
+    template.
+
+    We generate multiple Docker images for different purposes, as explained in the
+    documentation of `generate_derivative_dockerfiles` below. However, these derivative
+    images actually share most of the setup. So, to reduce build time and avoid
+    duplication, we generate a "base" Docker image, and then derive the rest of the
+    images from it.
+
+    :param image_root_dir: The root directory of the Docker image, i.e. where the
+    `Dockerfile` resides.
+    """
     # Template data
     data = {
         "bootstrapping_scripts_root_firstpass": sorted(
diff --git a/quality-checks/pip_install_check.py b/quality-checks/pip_install_check.py
index 0a8c213..5f20895 100755
--- a/quality-checks/pip_install_check.py
+++ b/quality-checks/pip_install_check.py
@@ -8,13 +8,22 @@
 """
 import os
 import sys
+from pathlib import Path

 EMJOI_CHECK_MARK_BUTTON = "\u2705"
 EMJOI_CROSS_MARK = "\u274C"


-def check_file_for_pip_install(filepath: str) -> bool:
+# List of files that are allowed to use `pip install` directly, instead of
+# `safe-pip-install`.
+PIP_INSTALL_ALLOWLIST = [
+    # Bootstrap steps that install Python will usually include updating `pip`
+    # itself, so they need to make direct use of `pip`.
+    'images/airflow/*/bootstrap/*/*-install-python.sh',
+]
+
+def check_file_for_pip_install(filepath: Path) -> bool:
     """
     Check if the file contains 'pip install'.
@@ -24,12 +33,12 @@ def check_file_for_pip_install(filepath: str) -> bool: """ with open(filepath, "r") as file: for line in file: - if "pip install" in line: + if "pip install" in line or "pip3 install" in line: return False return True -def verify_no_pip_install(directory: str) -> bool: +def verify_no_pip_install(directory: Path) -> bool: """ Verify there is no direct use of `pip install` in the directory tree. @@ -38,21 +47,21 @@ def verify_no_pip_install(directory: str) -> bool: :returns True if the verification succeeds, otherwise False. """ # Check if the directory exists - if not os.path.isdir(directory): + if not directory.is_dir(): print(f"The directory {directory} does not exist.") return True - # Walk through the directory tree + # Walk through the shell scripts in the directory tree. ret_code = True - for root, _dirs, files in os.walk(directory): - for filename in files: - if filename.endswith(".sh"): # Check for bash scripts - filepath = os.path.join(root, filename) - if check_file_for_pip_install(filepath): - print(f"{EMJOI_CHECK_MARK_BUTTON} {filepath}") - else: - print(f"{EMJOI_CROSS_MARK} {filepath}.") - ret_code = False + for filepath in directory.glob('**/*.sh'): + if any(filepath.match(p) for p in PIP_INSTALL_ALLOWLIST): + print(f"Ignoring {filepath} since it is in the allowlist.") + continue + if check_file_for_pip_install(filepath): + print(f"{EMJOI_CHECK_MARK_BUTTON} {filepath}") + else: + print(f"{EMJOI_CROSS_MARK} {filepath}.") + ret_code = False return ret_code @@ -78,7 +87,7 @@ def main() -> None: """Start execution of the script.""" verify_in_repo_root() - if verify_no_pip_install("./"): + if verify_no_pip_install(Path("./images/airflow")): sys.exit(0) else: print(