backup: local
Do not merge.
Do not delete.
This branch is a backup.
Thanks.
hacherix committed Jan 16, 2025
1 parent 5ed34d0 commit a1d00e5
Showing 6 changed files with 58 additions and 68 deletions.
38 changes: 11 additions & 27 deletions Dockerfile
@@ -1,23 +1,10 @@

FROM apache/airflow:2.10.0-python3.10
FROM apache/airflow:2.7.1-python3.9

USER root

ARG AIRFLOW_HOME=/opt/airflow

ADD dags /opt/airflow/dags

ADD airflow.cfg /opt/airflow/airflow.cfg

USER airflow

RUN pip install --upgrade pip
ARG AIRFLOW_HOME=/opt/airflow

USER root

# MySQL key rotation (https://dev.mysql.com/doc/refman/8.0/en/checking-gpg-signature.html)
# RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A8D3785C

RUN apt-get update -y
RUN apt-get install git -y
RUN apt-get install lftp -y
@@ -28,23 +15,20 @@ RUN apt-get install nano -y
RUN apt-get install jq -y
RUN apt-get install libmagic1 -y

RUN chown -R "airflow:root" /opt/airflow/

RUN chown -R "airflow:root" ${AIRFLOW_HOME}
ADD ssh /home/airflow/.ssh/
RUN chown -R airflow:root /home/airflow/.ssh

USER airflow

RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org boto3
RUN chown -R "airflow:root" /home/airflow/.ssh


# USER ${AIRFLOW_UID}
USER airflow

RUN pip install --upgrade pip
RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org boto3
ADD requirements.txt /requirements.txt

RUN pip install -r /requirements.txt

RUN git config --global user.email "your email"
RUN git config --global user.name "your username"
ARG USER_NAME
ARG USER_EMAIL
RUN git config --global user.email "${USER_EMAIL}"
RUN git config --global user.name "${USER_NAME}"

ADD airflow.cfg ${AIRFLOW_HOME}/airflow.cfg
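
The new `USER_NAME` and `USER_EMAIL` build arguments can also be passed when building the image by hand, outside docker-compose. A minimal sketch, assuming placeholder identity values and an arbitrary tag:

```
# Standalone build; the two --build-arg values feed the ARG USER_NAME and
# ARG USER_EMAIL declarations above. Email, name and tag are placeholders.
docker build \
  --build-arg USER_EMAIL="you@example.org" \
  --build-arg USER_NAME="your-name" \
  -t data-engineering-stack:local .
```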
8 changes: 4 additions & 4 deletions README.md
@@ -7,15 +7,15 @@ L'infrastructure actuelle est basée sur du LocalExecutor (le scheduler, le webs
## Installation

```
git clone git@github.com:etalab/data-engineering-stack.git
git clone git@github.com:datagouv/data-engineering-stack.git
cd data-engineering-stack
# Create directories necessary for Airflow to work
./1_prepareDirs.sh
# Prepare .env file
# Prepare .env file
./2_prepare_env.sh
nano .env
nano .env
# Edit POSTGRES_USER ; POSTGRES_PASSWORD ; POSTGRES_DB ; AIRFLOW_ADMIN_MAIL ; AIRFLOW_ADMIN_FIRSTNAME ; AIRFLOW_ADMIN_NAME ; AIRFLOW_ADMIN_PASSWORD
# Launch services
@@ -31,6 +31,6 @@ docker-compose up --build -d
./refreshBagDags.sh
```

## Connections
## Variables and connections

Connections can be created manually or with the Python script `createConn.py` (using the Airflow API) inside each project. You also need to add your SSH key to the repository's `ssh` folder so that the container can see it in its `/home/airflow/.ssh/` folder.
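
`createConn.py` itself is not part of this diff. As a hedged illustration, an equivalent connection can be created with the stock Airflow CLI; the service name `webserver`, the connection id and the host below are assumptions rather than values from the repository:

```
# Create an SSH connection inside the running webserver container.
# "webserver" is an assumed compose service name; all connection
# values are examples to adapt.
docker-compose exec webserver \
  airflow connections add my_ssh_conn \
    --conn-type ssh \
    --conn-host example.org \
    --conn-login airflow
```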
8 changes: 4 additions & 4 deletions airflow.cfg
@@ -121,7 +121,7 @@ plugins_folder = /opt/airflow/plugins
execute_tasks_new_python_interpreter = False

# Secret key to save connection passwords in the db
fernet_key =
fernet_key =

# Whether to disable pickling dags
donot_pickle = True
@@ -525,7 +525,7 @@ expose_hostname = True
expose_stacktrace = True

# Default DAG view. Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times``
dag_default_view = tree
dag_default_view = grid

# Default DAG orientation. Valid values are:
# ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top)
@@ -881,7 +881,7 @@ catchup_by_default = True
# - excessive locking
# Additionally, you may hit the maximum allowable query length for your db.
# Set this to 0 for no limit (not advised)
max_tis_per_query = 512
max_tis_per_query = 16

# Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries.
# If this is set to False then you should not run more than a single
@@ -1072,4 +1072,4 @@ shard_code_upper_limit = 10000
shards = 5

# comma separated sensor classes support in smart_sensor.
sensors_enabled = NamedHivePartitionSensor
sensors_enabled = NamedHivePartitionSensor
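
The `fernet_key` left empty earlier in this file has to be filled in for Airflow to encrypt connection passwords. A standard way to generate one, shown here as a sketch (it relies on the `cryptography` package that Airflow already depends on):

```
# Print a fresh Fernet key suitable for the fernet_key setting in airflow.cfg.
python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())"
```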
14 changes: 9 additions & 5 deletions docker-compose.yml
@@ -1,10 +1,10 @@
version: "3"
services:
postgres:
image: postgres:12
user: "${AIRFLOW_UID}:${AIRFLOW_GID}"
user: root
volumes:
- ./pg-airflow:/var/lib/postgresql/data
restart: unless-stopped
env_file:
- .env
ports:
@@ -14,22 +14,26 @@ services:
build:
context: .
dockerfile: Dockerfile
args:
- USER_EMAIL=$USER_EMAIL
- USER_NAME=$USER_NAME
hostname: webserver
restart: always
restart: unless-stopped
depends_on:
- postgres
command: webserver
env_file:
- .env
volumes:
- ./dags:/opt/airflow/dags
- ${LOCAL_AIRFLOW_DAG_PATH}:/opt/airflow/dags/datagouvfr_data_pipelines
- ${LOCAL_TMP_PATH}:/tmp
- ./scripts:/opt/airflow/scripts
- ./logs:/opt/airflow/logs
- ./plugins:/opt/airflow/plugins
- ./requirements.txt:/opt/airflow/requirements.txt
ports:
- "127.0.0.1:${AIRFLOW_WEBSERVER_PORT}:8080"
- "127.0.0.1:${AIRFLOW_LOG_SERVER_PORT}:8793"
- "127.0.0.1:${AIRFLOW_LOG_SERVER_PORT}:8794"
entrypoint: ./scripts/airflow-entrypoint.sh
healthcheck:
test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-webserver.pid ]"]
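
Everything else in docker-compose.yml comes from `.env`. A hypothetical sketch of that file: the variable names are the ones referenced in the compose file and the README, while every value is a placeholder to adapt locally.

```
# Example .env; names come from docker-compose.yml and the README.
AIRFLOW_UID=50000
AIRFLOW_GID=0
AIRFLOW_WEBSERVER_PORT=8080
AIRFLOW_LOG_SERVER_PORT=8793
LOCAL_AIRFLOW_DAG_PATH=/srv/airflow/datagouvfr_data_pipelines
LOCAL_TMP_PATH=/srv/airflow/tmp
USER_EMAIL=you@example.org
USER_NAME=your-name
POSTGRES_USER=airflow
POSTGRES_PASSWORD=change-me
POSTGRES_DB=airflow
AIRFLOW_ADMIN_MAIL=admin@example.org
AIRFLOW_ADMIN_FIRSTNAME=Jane
AIRFLOW_ADMIN_NAME=Doe
AIRFLOW_ADMIN_PASSWORD=change-me
```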
52 changes: 28 additions & 24 deletions requirements.txt
@@ -1,33 +1,37 @@
GitPython
table-schema-to-markdown==0.4.12
frictionless==4.25.1
jsonschema
Unidecode==1.3.6
PyYAML
geojson
shapely
minio==7.2.8
boto3==1.35.0
gitpython==3.1.27
table-schema-to-markdown==0.4.13
faust-cchardet==2.1.19
frictionless==5.18.0
jsonschema==4.23.0
unidecode==1.3.6
pyyaml==6.0
geojson==2.5.0
shapely==1.8.2
minio==7.1.3
boto3==1.26.65
emails==0.6
pandas==2.2.2
papermill==2.6.0
plotly==5.24.0
plotly_express==0.4.1
pandas==2.2.3
papermill==2.4.0
plotly==5.6.0
plotly-express==0.4.1
kaleido==0.2.1
ipykernel==5.5.6
nbconvert==6.5.1
openpyxl==3.1.5
nbconvert==6.5.3
openpyxl==3.1.2
elasticsearch==7.17.0
elasticsearch_dsl==7.4.0
python-dotenv==0.21.0
elasticsearch-dsl==7.4.0
requests==2.31.0
python-dotenv==0.20.0
swifter==1.1.3
tweepy==4.8.0
pytest==7.2.1
psycopg2-binary==2.9.5
py7zr==0.20.4
xlsxwriter==3.0.8
langdetect==1.0.9
pydantic==2.4.0
pyproj==3.6.1
requests==2.32.3
swifter==1.4.0
rdflib==6.3.2
feedgen==1.0.0
duckdb==0.10.2
python-magic==0.4.27
asyncpg==0.29.0
uvloop==0.20.0
python-frontmatter==0.5.0
tenacity==9.0.0
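
With every package now pinned, version conflicts surface at install time instead of at runtime. One quick way to sanity-check the pins locally before rebuilding the image, offered as a suggestion rather than as part of this commit:

```
# Install the pinned set into a throwaway virtualenv, then let pip verify
# that the resulting environment has no conflicting requirements.
python -m venv /tmp/req-check
/tmp/req-check/bin/pip install -r requirements.txt
/tmp/req-check/bin/pip check
```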
6 changes: 2 additions & 4 deletions scripts/airflow-entrypoint.sh
@@ -1,8 +1,6 @@
#!/usr/bin/env bash
airflow resetdb
airflow db init
airflow upgradedb
airflow db reset
airflow db migrate
airflow users create -r Admin -u "$AIRFLOW_ADMIN_MAIL" -e "$AIRFLOW_ADMIN_MAIL" -f "$AIRFLOW_ADMIN_FIRSTNAME" -l "$AIRFLOW_ADMIN_NAME" -p "$AIRFLOW_ADMIN_PASSWORD"
airflow scheduler &
airflow webserver
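
Two caveats about the rewritten entrypoint: `airflow db reset` wipes the metadata database on every container start, and it normally waits for interactive confirmation. A sketch of a non-interactive variant, not what the commit itself does:

```
# -y / --yes skips the confirmation prompt so the script cannot hang
# waiting for input during a non-interactive container start.
airflow db reset -y
airflow db migrate
```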
