[IA-4839] [DO NOT MERGE] Terra on Azure (ToA) base jupyter docker image #483

Open
wants to merge 19 commits into base: master
Changes from all commits (19 commits)
fabcdaf
original attempt at creating a leaner terra base docker image
LizBaldo Oct 11, 2023
82c8ec9
remove opt/conda jupyter path
LizBaldo Oct 12, 2023
d5dc18a
use nbclassic instead of notebook to maintain backward compatibility …
LizBaldo Oct 12, 2023
1efe5a5
do not install notebook extensions anymore - should not be necessary …
LizBaldo Oct 12, 2023
6c42c84
better jupyter creation best practice, sudo permissions, and conda in…
LizBaldo Oct 25, 2023
9863460
this seems to work well on my BEE so saving it
LizBaldo Oct 26, 2023
9075907
latest backup
LizBaldo Oct 27, 2023
9a63654
use nbclassic to stay backward compatible with js extensions
LizBaldo Nov 1, 2023
0d4096e
working image with jupyter in base and user virtual environment
LizBaldo Apr 10, 2024
b3b1a28
isolating jupyter env instead of user venv
LizBaldo Apr 10, 2024
3a2ff3f
fixing conda env name display in jupyter terminal
LizBaldo Apr 11, 2024
2f7f7b4
adding smoke test, updating gha versions, and adding a readme
LizBaldo Apr 11, 2024
cfd2af2
try not running as root but granting sudo priviledges to jupyter user
LizBaldo Apr 16, 2024
7c1da78
addressing dockerfile comments
LizBaldo Apr 18, 2024
a72f84e
change jupyter-user uid and make run-jupyter.sh executable
LizBaldo Apr 24, 2024
2be78da
specify notebook dir and change ownership to jupyter user
LizBaldo May 2, 2024
333085b
only focus on changes for the new image
LizBaldo May 29, 2024
4f6da7d
more cleanup
LizBaldo May 29, 2024
ae65b98
try to ignore the platform flag when looking for base images
LizBaldo May 29, 2024
89 changes: 89 additions & 0 deletions .github/workflows/test-terra-base-jupyter.yml
@@ -0,0 +1,89 @@
name: Test terra-base-jupyter
# Perform smoke tests on the terra-base-jupyter Docker image to have some amount of confidence that
# Python package versions are compatible.
#
# To configure the minimal auth needed for these tests to be able to read public data from Google Cloud Platform:
# Step 1: Create a service account per these instructions:
# https://github.com/google-github-actions/setup-gcloud/blob/master/setup-gcloud/README.md
# Step 2: Give the service account the following permissions within the project: BigQuery User
# Step 3: Store its key and project id as GitHub repository secrets TD_GCP_SA_KEY and GCP_PROJECT_ID.
# https://docs.github.com/en/free-pro-team@latest/actions/reference/encrypted-secrets#creating-encrypted-secrets-for-a-repository

on:
  pull_request:
    branches: [ master ]
    paths:
      - 'terra-base-jupyter/**'
      - '.github/workflows/test-terra-base-jupyter.yml'

  push:
    # Note: GitHub secrets are not passed to pull requests from forks. For community contributions from
    # regular contributors, it's a good idea for the contributor to configure the GitHub Actions to run correctly
    # in their fork as described above.
    #
    # For occasional contributors, the dev team will merge the PR fork branch to a branch in upstream named
    # test-community-contribution-<PR#> to run all the GitHub Action smoke tests.
    branches: [ 'test-community-contribution*' ]
    paths:
      - 'terra-base-jupyter/**'
      - '.github/workflows/test-terra-base-jupyter.yml'

  workflow_dispatch:
    # Allows manual triggering of the workflow on a selected branch via the GitHub Actions tab.
    # GitHub blog demo: https://github.blog/changelog/2020-07-06-github-actions-manual-triggers-with-workflow_dispatch/.

env:
  GOOGLE_PROJECT: ${{ secrets.GCP_PROJECT_ID }}

jobs:

  test_docker_image:
    runs-on: self-hosted

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Free up some disk space
        run: sudo rm -rf /usr/share/dotnet

      - id: auth
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.TD_GCP_SA_KEY }}
          create_credentials_file: true

      - name: Set up Cloud SDK
        uses: google-github-actions/[email protected]
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      - name: Build Docker image and base images too, if needed
        run: |
          gcloud auth configure-docker
          ./build_smoke_test_image.sh terra-base-jupyter

      - name: Upload workflow artifacts
        uses: actions/upload-artifact@v2
        with:
          name: notebook-execution-results
          path: terra-base-jupyter/tests/*.html
          retention-days: 30

      - name: Test Python code with pytest
        run: |
          chmod a+r "${{ steps.auth.outputs.credentials_file_path }}"
          docker run \
            --env GOOGLE_PROJECT \
            --volume "${{ steps.auth.outputs.credentials_file_path }}":/tmp/credentials.json:ro \
            --env GOOGLE_APPLICATION_CREDENTIALS="/tmp/credentials.json" \
            --volume $GITHUB_WORKSPACE/terra-base-jupyter/tests:/tests \
            --workdir=/tests \
            --entrypoint="" \
            terra-base-jupyter:smoke-test \
            /bin/sh -c "pip3 install pytest; pytest"
Comment on lines +81 to +89
Contributor

Would it be possible to either drop this into a scripts/run-tests script (or something like that), and/or use docker compose in combination, to set this up for the next person working on this?

Collaborator (Author)

Yep, I would love to brainstorm that with you, because it might fall slightly outside of this PR's scope. But the question here is: how can we easily test these docker images? I am not convinced that the current GHAs we have are the way to go.
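As a starting point for that discussion, here is a minimal sketch of what such a wrapper could look like, assuming a hypothetical scripts/run_smoke_tests.sh and the terra-base-jupyter:smoke-test tag produced by build_smoke_test_image.sh; the script name, arguments, and default credential path are illustrative, not part of this PR:

#!/usr/bin/env bash
# Hypothetical scripts/run_smoke_tests.sh: wraps the docker run / pytest step from the
# workflow above so the same smoke test can be run locally or from CI.
set -o errexit -o nounset -o pipefail

IMAGE="${1:-terra-base-jupyter:smoke-test}"
CREDS_FILE="${2:-$HOME/.config/gcloud/application_default_credentials.json}"
TESTS_DIR="$(pwd)/terra-base-jupyter/tests"

docker run \
  --env GOOGLE_PROJECT \
  --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/credentials.json \
  --volume "${CREDS_FILE}":/tmp/credentials.json:ro \
  --volume "${TESTS_DIR}":/tests \
  --workdir=/tests \
  --entrypoint="" \
  "${IMAGE}" \
  /bin/sh -c "pip3 install pytest; pytest"

The workflow step would then shrink to checking out the repo and calling the script, and a docker compose file could wrap the same volumes and environment variables if that turns out to be nicer to maintain.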

3 changes: 2 additions & 1 deletion .gitignore
@@ -17,4 +17,5 @@ package-lock.json
.python_history
.keras/
.ammonite/
.metals/
.metals/
.venv/
2 changes: 1 addition & 1 deletion build_smoke_test_image.sh
@@ -15,7 +15,7 @@ set -o xtrace
build_smoke_test_image() {
local IMAGE_TYPE=$1
pushd ${IMAGE_TYPE}
local BASE_IMAGES=$( egrep '^FROM (\S+)' Dockerfile |tr -s ' ' | cut -d ' ' -f 2 )
local BASE_IMAGES=$( egrep '^FROM (\S+)' Dockerfile | sed 's/--platform.*//' |tr -s ' ' | cut -d ' ' -f 2 )

local BASE_IMAGE
for BASE_IMAGE in ${BASE_IMAGES}; do
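To make the intent of the sed addition concrete, here is roughly how the extended pipeline treats the two shapes of FROM lines (an illustrative trace; the sample image names are made up):

# A plain FROM line still yields its base image name:
echo 'FROM some-local-base:latest' \
  | sed 's/--platform.*//' | tr -s ' ' | cut -d ' ' -f 2
# -> some-local-base:latest

# A FROM line that pins a platform is trimmed down to 'FROM ', so it contributes no
# candidate, and the external platform-pinned base image is simply not treated as a
# locally buildable image:
echo 'FROM --platform=linux/amd64 nvidia/cuda:12.2.0-base-ubuntu22.04' \
  | sed 's/--platform.*//' | tr -s ' ' | cut -d ' ' -f 2
# -> (empty)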
198 changes: 198 additions & 0 deletions terra-base-jupyter/Dockerfile
@@ -0,0 +1,198 @@
# Smallest image with Ubuntu Jammy, CUDA, and NVIDIA drivers installed - 80 MB
FROM --platform=linux/amd64 nvidia/cuda:12.2.0-base-ubuntu22.04

# Use bash as the shell, like the jupyter terminal (just nicer to work with than sh)
ENV SHELL /usr/bin/bash
SHELL ["/usr/bin/bash", "-c"]

#######################
# Environment Variables
#######################
ENV DEBIAN_FRONTEND noninteractive
ENV LC_ALL en_US.UTF-8

# We need node >18 for jupyter to work
ENV NODE_MAJOR 20

# Set the python version and corresponding conda installer
ENV PYTHON_VERSION 3.10
ENV CONDA_INSTALLER https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.1-0-Linux-x86_64.sh
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


###############
# Prerequisites
###############
RUN apt-get update && apt-get install -yq --no-install-recommends \
    # basic necessities
    sudo \
    ca-certificates \
    curl \
    jq \
    tree \
    # gnupg requirement
    gnupg \
    dirmngr \
    # useful utilities for debugging within docker itself
    nano \
    less \
    procps \
    lsb-release \
    # gcc compiler
    build-essential \
    locales \
    # for ssh-agent and ssh-add
    keychain \
    # extras
    wget \
    aria2 \
    bzip2 \
    # git
    git \
    # Uncomment en_US.UTF-8 for inclusion in generation
    && sed -i 's/^# *\(en_US.UTF-8\)/\1/' /etc/locale.gen \
    # Generate locale
    && locale-gen \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Node >18 (needed for jupyterlab)
RUN apt-get update && apt-get install -yq --no-install-recommends
RUN mkdir -p /etc/apt/keyrings
RUN curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg

RUN echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list
RUN apt-get update && apt-get install -f -yq nodejs

#############
# Users Setup
#############
# Create the welder user
# The welder uid is consistent with the Welder docker definition here:
# https://github.com/DataBiosphere/welder/blob/master/project/Settings.scala
# Adding welder-user to the Jupyter container isn't strictly required, but it makes welder-added
# files display nicer when viewed in a terminal.
ENV WELDER_USER welder-user
ENV WELDER_UID 1001
RUN useradd -m -N -u $WELDER_UID $WELDER_USER

# Create the jupyter user
ENV JUPYTER_USER jupyter-user
ENV JUPYTER_UID 1002
# Create the jupyter user home
ENV JUPYTER_USER_HOME /home/$JUPYTER_USER
RUN useradd -m -d $JUPYTER_USER_HOME -N -u $JUPYTER_UID -g users $JUPYTER_USER
# We want to grant the jupyter user passwordless sudo permissions
# so they can install whatever packages they need inside the docker container
RUN echo "$JUPYTER_USER ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/$JUPYTER_USER \
&& chmod 0440 /etc/sudoers.d/$JUPYTER_USER

#####################################
# Install Python via Miniconda
#####################################
## Note: CONDA should NOT be used by terra devs to manage dependencies (see the use of poetry below instead),
## but is a widely used tool to manage python environments in a runtime and we should provide it to users
## We want to store the user conda environments in a directory
## that will be in the persistent disk
## Attention: If you change the Conda home location, please update conda_init.txt accordingly
ENV CONDA_ENV_NAME base-python${PYTHON_VERSION}
ENV CONDA_ENV_HOME $JUPYTER_USER_HOME/.envs/$CONDA_ENV_NAME
RUN curl -so $JUPYTER_USER_HOME/miniconda.sh ${CONDA_INSTALLER} \
&& chmod +x $JUPYTER_USER_HOME/miniconda.sh \
&& $JUPYTER_USER_HOME/miniconda.sh -b -p $CONDA_ENV_HOME \
&& rm $JUPYTER_USER_HOME/miniconda.sh
ENV PATH "${PATH}:${CONDA_ENV_HOME}/bin"

# Set up the path to the user python
ENV BASE_PYTHON_PATH $CONDA_ENV_HOME/bin/python
# Tell Python to NOT write bytecode files (aka the .pyc files)
ENV PYTHONDONTWRITEBYTECODE=true
LizBaldo marked this conversation as resolved.
Show resolved Hide resolved

###################################################
# Set up the user to use the conda base environment
###################################################
## The user should have full access to the conda base environment, and can use it directly, or
## create new conda environments on top of it. The important part is that jupyter IS NOT installed
## in the base environment, to provide isolation between the user environment and the jupyter server
## to avoid cross-contamination
COPY conda-environment.yml .


If I understand the goal here correctly, maybe using conda-pack can help further reduce the image size.
When we build Docker images that use conda to manage envs, we use conda-pack combined with multi-stage builds to reduce image sizes.
Example here:
https://github.com/broadinstitute/long-read-pipelines/blob/4b50b3857d33fd195461e5eb5c8a83d7fe6dda27/docker/lr-papermill-base/Dockerfile#L8


To add to this, it'd be good to have an understanding of how large each layer is before optimizing the sizes further.

Collaborator (Author)

That is a good callout. I will add a breakdown of how big each layer is to the README. Regarding conda-pack, I definitely get why you are using it, but since the plan is for the majority of Terra users to use this base image, I was thinking about setting up the base conda environment directly.
The use case of building a custom image on top of it is a bit of an edge case, so the base image might not be perfectly curated for your needs. Would this be a problem?
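For the layer-size breakdown mentioned here, docker history gives the per-layer sizes directly (the tag below is the smoke-test tag used elsewhere in this PR and is only an example):

docker history terra-base-jupyter:smoke-test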


No, this would not be a problem, just an optimization (which could be evil, as it might be premature). OTOH, our experience with conda-pack is that it reduces the image size further.
This is where we borrowed the lesson: https://pythonspeed.com/articles/conda-docker-image-size/

I can give it a try once the other components are steady, actually, since I'm the one mostly interested in it.

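For reference, a rough sketch of the conda-pack plus multi-stage approach discussed above, adapted from the linked pythonspeed article; the builder image, environment name, and paths are assumptions, not part of this PR:

FROM continuumio/miniconda3 AS conda-build
COPY conda-environment.yml .
RUN conda env create -f conda-environment.yml -n packed-env
# conda-pack bundles the environment into a relocatable archive
RUN conda install -c conda-forge conda-pack \
    && conda-pack -n packed-env -o /tmp/env.tar.gz \
    && mkdir /venv && tar -xzf /tmp/env.tar.gz -C /venv \
    && /venv/bin/conda-unpack

# The runtime stage carries only the unpacked environment, not conda or its package cache
FROM --platform=linux/amd64 nvidia/cuda:12.2.0-base-ubuntu22.04
COPY --from=conda-build /venv /venv
ENV PATH /venv/bin:$PATH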
RUN conda env update --prefix $CONDA_ENV_HOME --file conda-environment.yml --prune \
# Remove packages tarballs and python bytecode files from the image
&& conda clean -afy \
&& rm conda-environment.yml \
# Make sure the JUPYTER_USER is the owner of the folder where
# the base conda is installed
&& chown -R $JUPYTER_USER:users $JUPYTER_USER_HOME

# Add the user base conda environment as a jupyter kernel - this should be the default now
# This command activates the conda environment and then calls ipykernel from within
# to install it as a kernel under the same name
RUN conda run -p ${CONDA_ENV_HOME} python -m ipykernel install --name=$CONDA_ENV_NAME

# Prep the jupyter terminal with conda init, and make sure the base conda environment is
# activated and the name is displayed in the terminal prompt
COPY conda_init.txt .
RUN cat conda_init.txt >> $JUPYTER_USER_HOME/.bashrc && \
printf "\nconda activate ${CONDA_ENV_HOME}" >> $JUPYTER_USER_HOME/.bashrc && \
conda config --set env_prompt '({name})' && \
source $JUPYTER_USER_HOME/.bashrc && \
rm conda_init.txt

####################################################
# Install Jupyter in an isolated virtual environment
####################################################
## Virtualenv and POETRY are the preferred tools to create virtual environments and
## manage dependencies for Terra Devs - poetry docs: https://python-poetry.org/docs/
ENV POETRY_HOME /opt/poetry
# Append POETRY_HOME to PATH
ENV PATH "${PATH}:${POETRY_HOME}/bin"
COPY poetry.lock .
COPY pyproject.toml .

ENV JUPYTER_HOME /usr/jupytervenv
# Add the jupyter virtual environment to PATH,
# but make sure to add it at the end so that the
# Conda base python takes precedence
# (aka the ! operator in IPython shells should NOT access the jupyter virtual environment)
ENV PATH "${PATH}:${JUPYTER_HOME}/bin"

# Install Poetry, set up the virtual environment for jupyter to run and then cleanup / uninstall poetry
RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=$POETRY_HOME $BASE_PYTHON_PATH \
Collaborator (Author)

One thing that I was thinking about is maybe not using the user python to install the jupyter server, but instead using the system one, to provide another layer of isolation (a rough sketch of that alternative follows the RUN instruction below).

# Create a virtual environment and activate it for poetry to use
&& $BASE_PYTHON_PATH -m venv $JUPYTER_HOME \
&& source $JUPYTER_HOME/bin/activate \
# Install python dependencies with poetry
&& poetry install --no-interaction --no-ansi --no-dev --no-cache \
# Cleanup
&& rm poetry.lock && rm pyproject.toml \
&& curl -sSL https://install.python-poetry.org | POETRY_HOME=$POETRY_HOME $BASE_PYTHON_PATH - --uninstall
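For reference, a rough sketch of the alternative raised in the comment above: build the Jupyter virtualenv from the distribution Python rather than the conda user Python, so the server and the user environment share nothing. The apt package list is an assumption, and poetry is assumed to have been bootstrapped from the same system interpreter; none of this is part of the PR:

RUN apt-get update && apt-get install -yq --no-install-recommends python3 python3-venv \
    && rm -rf /var/lib/apt/lists/* \
    # /usr/bin/python3 is 3.10 on jammy, matching PYTHON_VERSION above
    && /usr/bin/python3 -m venv $JUPYTER_HOME \
    && source $JUPYTER_HOME/bin/activate \
    && poetry install --no-interaction --no-ansi --no-dev --no-cache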

##################################
# Terra-specific Jupyter Utilities
##################################
# Ensure this matches c.ServerApp.port in 'jupyter_server_config.py'
ENV JUPYTER_PORT 8888
EXPOSE $JUPYTER_PORT

# Install the custom extensions to enable welder for file syncing
COPY custom $JUPYTER_HOME/etc/jupyter/custom
COPY custom/jupyter_delocalize.py $JUPYTER_HOME/lib/python${PYTHON_VERSION}/site-packages
COPY jupyter_server_config.py $JUPYTER_HOME/etc/jupyter

# Remove the jupyter environment from the list of available kernels so it is hidden from the user
# Note that this needs to be done in combination with setting the c.KernelSpecManager.ensure_native_kernel flag
# to False in 'jupyter_server_config.py'
RUN $JUPYTER_HOME/bin/jupyter kernelspec remove python3 -y

# Copy the script that the service deploying to Terra (e.g. leonardo) will use for docker exec
COPY run-jupyter.sh $JUPYTER_HOME/run-jupyter.sh
RUN chmod +x $JUPYTER_HOME/run-jupyter.sh

# Set up the user and working directory, which is where the persistent disk will be mounted
USER $JUPYTER_USER
WORKDIR $JUPYTER_USER_HOME/persistent_disk

# Note: this entrypoint is provided for running Jupyter independently of Leonardo.
# When Leonardo deploys this image onto a cluster, the entrypoint is overwritten to enable
# additional setup inside the container before execution. Jupyter execution occurs when the
# init-actions.sh script uses 'docker exec' to call run-jupyter.sh.
ENTRYPOINT ["/usr/jupytervenv/bin/jupyter", "lab"]
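For running the image on its own, outside Leonardo, something like the following works with the entrypoint above (a hedged example; the smoke-test tag and the host workspace directory are assumptions):

docker run --rm -it \
  --publish 8888:8888 \
  --volume "$(pwd)/workspace":/home/jupyter-user/persistent_disk \
  terra-base-jupyter:smoke-test \
  --ip=0.0.0.0 --port=8888

The trailing flags are appended to the ENTRYPOINT, so the container runs 'jupyter lab --ip=0.0.0.0 --port=8888'; if jupyter_server_config.py already pins the IP and port, they are redundant but harmless.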