diff --git a/.github/matrix-configs.json b/.github/matrix-configs.json
new file mode 100644
index 000000000..a1be0b14b
--- /dev/null
+++ b/.github/matrix-configs.json
@@ -0,0 +1,100 @@
+[
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "falcon-7b",
+      "dockerfile": "docker/presets/falcon/Dockerfile",
+      "build_args": "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-7b"
+    },
+    "shouldBuildFalcon": "true"
+  },
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "falcon-7b-instruct",
+      "dockerfile": "docker/presets/falcon/Dockerfile",
+      "build_args": "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-7b-instruct"
+    },
+    "shouldBuildFalcon": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "falcon-40b",
+      "dockerfile": "docker/presets/falcon/Dockerfile",
+      "build_args": "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-40b"
+    },
+    "shouldBuildFalcon": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "falcon-40b-instruct",
+      "dockerfile": "docker/presets/falcon/Dockerfile",
+      "build_args": "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-40b-instruct"
+    },
+    "shouldBuildFalcon": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-7b",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-7b --build-arg SRC_DIR=/home/presets/llama-2"
+    },
+    "shouldBuildLlama2": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-13b",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-13b --build-arg SRC_DIR=/home/presets/llama-2"
+    },
+    "shouldBuildLlama2": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-70b",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-70b --build-arg SRC_DIR=/home/presets/llama-2"
+    },
+    "shouldBuildLlama2": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-7b-chat",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-7b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
+    },
+    "shouldBuildLlama2Chat": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-13b-chat",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-13b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
+    },
+    "shouldBuildLlama2Chat": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-70b-chat",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-70b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
+    },
+    "shouldBuildLlama2Chat": "true"
+  }
+]
diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
index ad9a42109..39f1cf515 100644
--- a/.github/workflows/e2e-preset-test.yml
+++ b/.github/workflows/e2e-preset-test.yml
@@ -5,8 +5,11 @@ on:
     workflows: ["Build and Push Preset Models"]
     types:
       - completed
-  workflow_dispatch: {}
-
+  workflow_dispatch:
+    inputs:
+      image_tag:
+        description: 'Image Tag'
+        required: true
 env:
   GO_VERSION: "1.20"
 
@@ -15,7 +18,70 @@ permissions:
   contents: read
 
 jobs:
+  setup:
+    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
+    runs-on: self-hosted
+    outputs:
+      IMG_TAG: ${{ steps.set_final_tag.outputs.IMG_TAG }}
+    steps:
+    - name: Determine tag from dispatch
+      if: github.event_name == 'workflow_dispatch'
+      id: determine_tag
+      run: echo "IMG_TAG=${{ github.event.inputs.image_tag }}" >> $GITHUB_OUTPUT
+
+    - name: Download tag artifact
+      if: github.event_name == 'workflow_run'
+      uses: actions/github-script@v6
+      with:
+        github-token: ${{ secrets.KAITO_ACCESS_TOKEN_READ }}
+        script: |
+          let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            run_id: context.payload.workflow_run.id,
+          });
+          let matchArtifact = allArtifacts.data.artifacts.filter((artifact) => {
+            return artifact.name == "artifacts"
+          })[0];
+          let download = await github.rest.actions.downloadArtifact({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            artifact_id: matchArtifact.id,
+            archive_format: 'zip',
+          });
+          let fs = require('fs');
+          fs.writeFileSync(`/tmp/artifacts.zip`, Buffer.from(download.data));
+    - name: Unzip tag artifact
+      if: github.event_name == 'workflow_run'
+      run: |
+        mkdir -p /tmp/artifacts
+        unzip /tmp/artifacts.zip -d /tmp/artifacts
+      shell: bash
+    - name: Display downloaded artifacts
+      if: github.event_name == 'workflow_run'
+      run: |
+        echo "Downloaded artifacts:"
+        ls -ablh /tmp/artifacts
+      shell: bash
+    - name: Parse artifacts and set image tag output
+      if: github.event_name == 'workflow_run'
+      id: get_image_tag
+      run: |
+        tag=$(tail -n 1 /tmp/artifacts/tag.txt)
+        echo "IMG_TAG=$tag" >> $GITHUB_OUTPUT
+
+    - name: Set final image tag
+      id: set_final_tag
+      run: |
+        if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+          echo "IMG_TAG=${{ steps.determine_tag.outputs.IMG_TAG }}" >> $GITHUB_OUTPUT
+        else
+          echo "IMG_TAG=${{ steps.get_image_tag.outputs.IMG_TAG }}" >> $GITHUB_OUTPUT
+        fi
+
   e2e-preset-tests:
+    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
+    needs: setup
     runs-on: self-hosted
     strategy:
       fail-fast: false
@@ -26,49 +92,48 @@ jobs:
           node-vm-size: Standard_NC12s_v3
          node-osdisk-size: 100
 
-        # - name: falcon-7b-instruct
-        #   node-count: 1
-        #   node-vm-size: Standard_NC12s_v3
-        #   node-osdisk-size: 100
+        - name: falcon-7b-instruct
+          node-count: 1
+          node-vm-size: Standard_NC12s_v3
+          node-osdisk-size: 100
 
-        # Uncomment once supported by ACR
-        # - name: falcon-40b
-        #   node-count: 1
-        #   node-vm-size: Standard_NC96ads_A100_v4
-        #   node-osdisk-size: 400
+        - name: falcon-40b
+          node-count: 1
+          node-vm-size: Standard_NC96ads_A100_v4
+          node-osdisk-size: 400
 
-        # - name: falcon-40b-instruct
-        #   node-count: 1
-        #   node-vm-size: Standard_NC96ads_A100_v4
-        #   node-osdisk-size: 400
+        - name: falcon-40b-instruct
+          node-count: 1
+          node-vm-size: Standard_NC96ads_A100_v4
+          node-osdisk-size: 400
 
-        # - name: llama-2-7b
-        #   node-count: 1
-        #   node-vm-size: Standard_NC12s_v3
-        #   node-osdisk-size: 100
+        - name: llama-2-7b
+          node-count: 1
+          node-vm-size: Standard_NC12s_v3
+          node-osdisk-size: 100
 
-        # - name: llama-2-13b
-        #   node-count: 2
-        #   node-vm-size: Standard_NC12s_v3
-        #   node-osdisk-size: 150
+        - name: llama-2-13b
+          node-count: 2
+          node-vm-size: Standard_NC12s_v3
+          node-osdisk-size: 150
 
-        # Uncomment once supported by ACR
+        # Uncomment once the service/deployment is created
         # - name: llama-2-70b
        #   node-count: 2
        #   node-vm-size: Standard_NC96ads_A100_v4
        #   node-osdisk-size: 400
 
-        # - name: llama-2-7b-chat
-        #   node-count: 1
-        #   node-vm-size: Standard_NC12s_v3
-        #   node-osdisk-size: 100
+        - name: llama-2-7b-chat
+          node-count: 1
+          node-vm-size: Standard_NC12s_v3
+          node-osdisk-size: 100
 
-        # - name: llama-2-13b-chat
-        #   node-count: 2
-        #   node-vm-size: Standard_NC12s_v3
-        #   node-osdisk-size: 150
+        - name: llama-2-13b-chat
+          node-count: 2
+          node-vm-size: Standard_NC12s_v3
+          node-osdisk-size: 150
 
-        # Uncomment once supported by ACR
+        # Uncomment once the service/deployment is created
        # - name: llama-2-70b-chat
        #   node-count: 2
        #   node-vm-size: Standard_NC96ads_A100_v4
        #   node-osdisk-size: 400
@@ -80,21 +145,12 @@ jobs:
       with:
         submodules: true
         fetch-depth: 0
-
-    - name: Download image tag artifact
-      uses: actions/download-artifact@v3
-      id: download
-      with:
-        name: image-tag-artifact
-
-    - name: Read image tag
-      id: get_image_tag
+
+    - name: Get ACR Name
+      id: get_acr_name
       run: |
-        tag=$(cat image_tag.txt)
-        echo "IMAGE_TAG=$tag" >> $GITHUB_OUTPUT
-        # Set the ACR based on the tag value
-        if [[ "$tag" == "latest" ]]; then
+        if [[ "${{ needs.setup.outputs.IMG_TAG }}" == "latest" ]]; then
           echo "ACR_NAME=aimodelsregistry" >> $GITHUB_OUTPUT
         else
           echo "ACR_NAME=aimodelsregistrytest" >> $GITHUB_OUTPUT
@@ -102,23 +158,46 @@ jobs:
 
     - name: Install Azure CLI latest
       run: |
-        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+        if ! which az > /dev/null; then
+          echo "Azure CLI not found. Installing..."
+          curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+        else
+          echo "Azure CLI already installed."
+        fi
 
-    - name: Az CLI login
+    - name: 'Az CLI login'
       uses: azure/login@v1.4.6
       with:
-        client-id: ${{ secrets.AZURE_KDM_PRESET_CLIENT_ID }}
+        client-id: ${{ secrets.AZURE_KDM_PRESET_SELF_RUNNER_CLIENT_ID }}
         tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-        subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+        allow-no-subscriptions: true
+
+    - name: 'Set subscription'
+      run: az account set --subscription ${{secrets.AZURE_SUBSCRIPTION_ID}}
 
-    - name: 'Login to ACR'
-      run: az acr login --name ${{ steps.get_image_tag.outputs.ACR_NAME }}
+    - name: 'Check if Image exists in ACR'
+      id: check_image
+      run: |
+        ACR_NAME=${{ steps.get_acr_name.outputs.ACR_NAME }}
+        IMAGE_NAME=${{ matrix.image.name }}
+        TAG=${{ needs.setup.outputs.IMG_TAG }}
+
+        TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv)
+
+        if echo "$TAGS" | grep -q "^$TAG$"; then
+          echo "IMAGE_EXISTS=true" >> $GITHUB_OUTPUT
+        else
+          echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT
+          echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME."
+        fi
 
     - name: Set up kubectl context
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
-        az aks get-credentials --resource-group llm-test --name new_demo
+        az aks get-credentials --resource-group llm-test --name GitRunner
 
     - name: Get Nodepool Name
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       id: get_nodepool_name
       run: |
         NAME_SUFFIX=${{ matrix.image.name }}
@@ -133,17 +212,18 @@ jobs:
         echo "NODEPOOL_NAME=$TRUNCATED_NAME_SUFFIX" >> $GITHUB_OUTPUT
 
     - name: Create Nodepool
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
         NODEPOOL_EXIST=$(az aks nodepool show \
           --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
-          --cluster-name new_demo \
+          --cluster-name GitRunner \
           --resource-group llm-test \
           --query 'name' -o tsv || echo "")
         echo "NODEPOOL_EXIST: $NODEPOOL_EXIST"
         if [ -z "$NODEPOOL_EXIST" ]; then
           az aks nodepool add \
             --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
-            --cluster-name new_demo \
+            --cluster-name GitRunner \
            --resource-group llm-test \
            --node-count ${{ matrix.image.node-count }} \
            --node-vm-size ${{ matrix.image.node-vm-size }} \
@@ -154,7 +234,7 @@ jobs:
        else
          NODEPOOL_STATE=$(az aks nodepool show \
            --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
-            --cluster-name new_demo \
+            --cluster-name GitRunner \
            --resource-group llm-test \
            --query 'provisioningState' -o tsv)
          echo "NODEPOOL_STATE: $NODEPOOL_STATE"
@@ -167,9 +247,11 @@ jobs:
        fi
 
     - name: Create Service
-      run: kubectl apply -f pkg/presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-service.yaml
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
+      run: kubectl apply -f presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-service.yaml
 
     - name: Retrieve External Service IP
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       id: get_ip
       run: |
        while [[ -z $SERVICE_IP ]]; do
@@ -180,25 +262,30 @@ jobs:
        echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT
 
     - name: Replace IP and Deploy Statefulset to K8s
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
-        sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" pkg/presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
-        sed -i "s/TAG_HERE/${{ steps.get_image_tag.outputs.IMAGE_TAG }}/g" pkg/presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
-        sed -i "s/REPO_HERE/${{ steps.get_image_tag.outputs.ACR_NAME }}/g" pkg/presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
-        kubectl apply -f pkg/presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
+        sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
+        sed -i "s/TAG_HERE/${{ needs.setup.outputs.IMG_TAG }}/g" presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
+        sed -i "s/REPO_HERE/${{ steps.get_acr_name.outputs.ACR_NAME }}/g" presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
+        kubectl apply -f presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
 
     - name: Wait for Statefulset to be ready
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
        kubectl rollout status statefulset/${{ matrix.image.name }}
 
     - name: Test home endpoint
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
        curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/
 
     - name: Test healthz endpoint
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
        curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz
 
     - name: Test inference endpoint
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
        if [[ "${{ matrix.image.name }}" == *"llama"* && "${{ matrix.image.name }}" == *"-chat"* ]]; then
          echo "Testing inference for ${{ matrix.image.name }}"
@@ -249,15 +336,29 @@ jobs:
     - name: Cleanup
       if: always()
       run: |
-        # Delete K8s Service
-        kubectl delete svc ${{ matrix.image.name }}
+        # Check and Delete K8s Service if it exists
+        if kubectl get svc ${{ matrix.image.name }} > /dev/null 2>&1; then
+          kubectl delete svc ${{ matrix.image.name }}
+        fi
+
+        # Check and Delete K8s StatefulSet if it exists
+        if kubectl get statefulset ${{ matrix.image.name }} > /dev/null 2>&1; then
+          kubectl delete statefulset ${{ matrix.image.name }}
+        fi
 
-        # Delete K8s Deployment
-        kubectl delete statefulset ${{ matrix.image.name }}
+        # Check and Delete AKS Nodepool if it exists
+        if [ -n "${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }}" ]; then
+          NODEPOOL_EXIST=$(az aks nodepool show \
+            --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
+            --cluster-name GitRunner \
+            --resource-group llm-test \
+            --query 'name' -o tsv || echo "")
 
-        # Delete AKS Nodepool
-        az aks nodepool delete \
-        --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
-        --cluster-name new_demo \
-        --resource-group llm-test
-        
\ No newline at end of file
+          if [ -n "$NODEPOOL_EXIST" ]; then
+            az aks nodepool delete \
+              --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
+              --cluster-name GitRunner \
+              --resource-group llm-test
+          fi
+        fi
+        
\ No newline at end of file
diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml
index 3419725cd..a05c3f183 100644
--- a/.github/workflows/preset-image-build.yml
+++ b/.github/workflows/preset-image-build.yml
@@ -23,7 +23,7 @@ on:
       default: 'no'
     image_tag:
       description: 'Image Tag'
-      required: false
+      required: true
 
 permissions:
   id-token: write
@@ -37,8 +37,6 @@ jobs:
      FALCON_MODIFIED: ${{ steps.check_modified_paths.outputs.FALCON_MODIFIED }}
      LLAMA2_MODIFIED: ${{ steps.check_modified_paths.outputs.LLAMA2_MODIFIED }}
      LLAMA2_CHAT_MODIFIED: ${{ steps.check_modified_paths.outputs.LLAMA2_CHAT_MODIFIED }}
-      ACR_NAME: ${{ steps.acr_login.outputs.ACR_NAME }}
-      POD_NAME: ${{ steps.get_pod_name.outputs.POD_NAME }}
    steps:
    - name: Checkout
      uses: actions/checkout@v4
@@ -87,7 +85,7 @@ jobs:
        echo "LLAMA2_MODIFIED=$LLAMA2_MODIFIED" >> $GITHUB_OUTPUT
        echo "LLAMA2_CHAT_MODIFIED=$LLAMA2_CHAT_MODIFIED" >> $GITHUB_OUTPUT
 
-    - name: Images to Build
+    - name: Models to Build
      run: |
        echo "FALCON_MODIFIED for this job: ${{ steps.check_modified_paths.outputs.FALCON_MODIFIED }}"
        echo "LLAMA2_MODIFIED for this job: ${{ steps.check_modified_paths.outputs.LLAMA2_MODIFIED }}"
@@ -111,17 +109,20 @@ jobs:
      run: |
        echo "image_tag for this job: ${{ steps.set_tag.outputs.image_tag }}"
 
-    - name: Save image tag as artifact
-      id: image_tag_path
+    - name: Save image tag as an artifact for other workflows
      run: |
-        echo "${{ steps.set_tag.outputs.image_tag }}" > image_tag.txt
-        echo "IMAGE_TAG_PATH=$(pwd)/image_tag.txt" >> $GITHUB_OUTPUT
-
+        sudo mkdir -p /tmp/artifacts
+        sudo chmod 777 /tmp/artifacts
+        echo ${{ steps.set_tag.outputs.image_tag }} | sudo tee /tmp/artifacts/tag.txt
+        sudo chmod 666 /tmp/artifacts/tag.txt
+        # ls -l /tmp/artifacts  # Check the permissions of the directory contents
+        cat /tmp/artifacts/tag.txt
+
    - name: Upload image tag as artifact
      uses: actions/upload-artifact@v3
      with:
-        name: image-tag-artifact
-        path: ${{ steps.image_tag_path.outputs.IMAGE_TAG_PATH }}
+        name: artifacts
+        path: /tmp/artifacts
 
    - name: Install Azure CLI latest
      run: |
@@ -131,7 +132,40 @@ jobs:
        else
          echo "Azure CLI already installed."
        fi
-
+
+  matrix_prep:
+    needs: setup
+    runs-on: self-hosted
+    outputs:
+      matrix: ${{ steps.set_matrix.outputs.matrix }}
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+        fetch-depth: 0
+
+    - name: "Set matrix"
+      id: set_matrix
+      run: |
+        matrix=$(jq --arg FALCON_MODIFIED "${{ needs.setup.outputs.FALCON_MODIFIED }}" --arg LLAMA2_MODIFIED "${{ needs.setup.outputs.LLAMA2_MODIFIED }}" --arg LLAMA2_CHAT_MODIFIED "${{ needs.setup.outputs.LLAMA2_CHAT_MODIFIED }}" 'map(
+          . | select((.shouldBuildFalcon == $FALCON_MODIFIED) or (.shouldBuildLlama2 == $LLAMA2_MODIFIED) or (.shouldBuildLlama2Chat == $LLAMA2_CHAT_MODIFIED))
+        )' .github/matrix-configs.json)
+        echo "matrix={\"include\":$(echo $matrix)}" >> $GITHUB_OUTPUT
+
+  build-models:
+    needs: [setup, matrix_prep]
+    runs-on: self-hosted
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.matrix_prep.outputs.matrix)}}
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+        fetch-depth: 0
+
    - name: 'Az CLI login'
      uses: azure/login@v1.4.6
      with:
@@ -145,18 +179,22 @@ jobs:
    - name: 'Attach and Login to ACR'
      id: acr_login
      run: |
-        if [[ "${{ steps.set_tag.outputs.image_tag }}" == "latest" ]]; then
-          echo "ACR_NAME=aimodelsregistry" >> $GITHUB_OUTPUT
-          az aks update -n GitRunner -g llm-test --attach-acr aimodelsregistry
-          az acr login -n aimodelsregistry --expose-token
+        if [[ "${{ needs.setup.outputs.image_tag }}" == "latest" ]]; then
+          ACR_NAME="aimodelsregistry"
        else
-          echo "ACR_NAME=aimodelsregistrytest" >> $GITHUB_OUTPUT
-          az aks update -n GitRunner -g llm-test --attach-acr aimodelsregistrytest
-          az acr login -n aimodelsregistrytest --expose-token
+          ACR_NAME="aimodelsregistrytest"
        fi
-    - name: Get Context
-      run: az aks get-credentials -n GitRunner -g llm-test
-
+
+        CHECK_ACR_OUTPUT=$(az aks check-acr -g llm-test -n GitRunner --acr $ACR_NAME)
+        if [[ ! $CHECK_ACR_OUTPUT =~ "Your cluster can pull images from $ACR_NAME.azurecr.io!" ]]; then
+          az aks update -n GitRunner -g llm-test --attach-acr $ACR_NAME
+        fi
+        az acr login -n $ACR_NAME --expose-token
+        echo "ACR_NAME=$ACR_NAME" >> $GITHUB_OUTPUT
+
+    - name: Get Context
+      run: az aks get-credentials -n GitRunner -g llm-test
+
    - name: Check if Docker Pod is Running (if not run it)
      run: |
        DEPLOYMENT=$(kubectl get deployment docker-deployment -o=jsonpath='{.metadata.name}' --ignore-not-found)
@@ -181,86 +219,7 @@ jobs:
          kubectl exec ${{ steps.get_pod_name.outputs.POD_NAME }} -- \
          docker login ${{ steps.acr_login.outputs.ACR_NAME }}.azurecr.io --username ${{ secrets.ACR_AMRT_USERNAME }} --password ${{ secrets.ACR_AMRT_PASSWORD }}
        fi
-
-  build-models:
-    needs: setup
-    runs-on: self-hosted
-    strategy:
-      fail-fast: false
-      matrix:
-        model:
-        - name: falcon-7b
-          dockerfile: docker/presets/falcon/Dockerfile
-          build_args: "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-7b"
-
-        - name: falcon-7b-instruct
-          dockerfile: docker/presets/falcon/Dockerfile
-          build_args: "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-7b-instruct"
-
-        - name: llama-2-7b
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-7b --build-arg SRC_DIR=/home/presets/llama-2"
-
-        - name: llama-2-7b-chat
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-7b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
-
-        - name: llama-2-13b
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-13b --build-arg SRC_DIR=/home/presets/llama-2"
-
-        - name: llama-2-13b-chat
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-13b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
-
-        # TODO: Support large models
-        - name: falcon-40b
-          dockerfile: docker/presets/falcon/Dockerfile
-          build_args: "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-40b"
-
-        - name: falcon-40b-instruct
-          dockerfile: docker/presets/falcon/Dockerfile
-          build_args: "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-40b-instruct"
-
-        - name: llama-2-70b
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-70b --build-arg SRC_DIR=/home/presets/llama-2"
-
-        - name: llama-2-70b-chat
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-70b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
-
-        include:
-        - name: falcon-7b
-          if: ${{ needs.setup.outputs.FALCON_MODIFIED == 'true' }}
-        - name: falcon-7b-instruct
-          if: ${{ needs.setup.outputs.FALCON_MODIFIED == 'true' }}
-        - name: llama-2-7b
-          if: ${{ needs.setup.outputs.LLAMA2_MODIFIED == 'true' }}
-        - name: llama-2-7b-chat
-          if: ${{ needs.setup.outputs.LLAMA2_CHAT_MODIFIED == 'true' }}
-        - name: llama-2-13b
-          if: ${{ needs.setup.outputs.LLAMA2_MODIFIED == 'true' }}
-        - name: llama-2-13b-chat
-          if: ${{ needs.setup.outputs.LLAMA2_CHAT_MODIFIED == 'true' }}
-        # TODO: Support large models
-        - name: falcon-40b
-          if: ${{ needs.setup.outputs.FALCON_MODIFIED == 'true' }}
-        - name: falcon-40b-instruct
-          if: ${{ needs.setup.outputs.FALCON_MODIFIED == 'true' }}
-
-        - name: llama-2-70b
-          if: ${{ needs.setup.outputs.LLAMA2_MODIFIED == 'true' }}
-        - name: llama-2-70b-chat
-          if: ${{ needs.setup.outputs.LLAMA2_CHAT_MODIFIED == 'true' }}
-
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v4
-      with:
-        submodules: true
-        fetch-depth: 0
-
+
    - name: Build model (with retries)
      run: |
        retries=3
        while [ $retries -gt 0 ]; do
          echo "Docker BUILD_ARGS: $BUILD_ARGS"
 
-          kubectl exec ${{ needs.setup.outputs.POD_NAME }} -- \
+          kubectl exec ${{ steps.get_pod_name.outputs.POD_NAME }} -- \
          docker build \
            $BUILD_ARGS \
-            -t ${{ needs.setup.outputs.ACR_NAME }}.azurecr.io/${{ matrix.model.name }}:${{ needs.setup.outputs.image_tag }} \
+            -t ${{ steps.acr_login.outputs.ACR_NAME }}.azurecr.io/${{ matrix.model.name }}:${{ needs.setup.outputs.image_tag }} \
            -f /home/${{ matrix.model.dockerfile }} \
            . && break
          retries=$((retries-1))
@@ -289,8 +248,8 @@ jobs:
        retries=3
        while [ $retries -gt 0 ]; do
          # Push the Docker image to ACR
-          kubectl exec ${{ needs.setup.outputs.POD_NAME }} -- \
-          docker push ${{ needs.setup.outputs.ACR_NAME }}.azurecr.io/${{ matrix.model.name }}:${{ needs.setup.outputs.image_tag }}
+          kubectl exec ${{ steps.get_pod_name.outputs.POD_NAME }} -- \
+          docker push ${{ steps.acr_login.outputs.ACR_NAME }}.azurecr.io/${{ matrix.model.name }}:${{ needs.setup.outputs.image_tag }}
 
          # Check if the push was successful
          if [ $? -eq 0 ]; then
diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile
index a7c2910dc..3f091053b 100644
--- a/docker/presets/llama-2/Dockerfile
+++ b/docker/presets/llama-2/Dockerfile
@@ -23,16 +23,6 @@ RUN pip install 'uvicorn[standard]'
 
 ARG LLAMA_VERSION
 ARG SRC_DIR
-# ARG EXTERNAL_IP
-# ARG EXTERNAL_PORT
-# ARG WEB_SERVER_AUTH_TOKEN
-# ENV AUTH_TOKEN_ENV_VAR=${WEB_SERVER_AUTH_TOKEN}
-
-# Copy Go download script into the Docker image
-# COPY /home/docker/presets/download_script.go /workspace/download_script.go
-
-# Use Go download script to fetch model weights
-# RUN go run /workspace/download_script.go "private" ${LLAMA_VERSION} "/workspace/llama/llama-2/weights" ${EXTERNAL_IP} ${EXTERNAL_PORT}
 
 ADD /home/llama/${LLAMA_VERSION} /workspace/llama/llama-2/weights
 ADD ${SRC_DIR} /workspace/llama/llama-2
diff --git a/presets/falcon/inference-api.py b/presets/falcon/inference-api.py
index 02efd26c5..9fa5fbaed 100644
--- a/presets/falcon/inference-api.py
+++ b/presets/falcon/inference-api.py
@@ -12,7 +12,7 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import transformers
 import torch
-import torch.distributed as dist
+# import torch.distributed as dist
 
 parser = argparse.ArgumentParser(description='Falcon Model Configuration')
 parser.add_argument('--load_in_8bit', default=False, action='store_true', help='Load model in 8-bit mode')
diff --git a/presets/k8s/docker.yaml b/presets/k8s/docker.yaml
index d07617911..ef4e152a4 100644
--- a/presets/k8s/docker.yaml
+++ b/presets/k8s/docker.yaml
@@ -36,4 +36,6 @@ spec:
       - name: falcon-volume
         hostPath:
           path: /falcon
-          type: Directory
\ No newline at end of file
+          type: Directory
+      nodeSelector:
+        kubernetes.io/hostname: aks-llamserver-28588359-vmss000000
diff --git a/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-service.yaml b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-service.yaml
new file mode 100644
index 000000000..cc41fb6f7
--- /dev/null
+++ b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-service.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: falcon-40b-instruct
+spec:
+  selector:
+    app: falcon
+    statefulset.kubernetes.io/pod-name: falcon-40b-instruct-0
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 5000
+  type: LoadBalancer
+  publishNotReadyAddresses: true
diff --git a/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml
new file mode 100644
index 000000000..0c39ec31e
--- /dev/null
+++ b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml
@@ -0,0 +1,51 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: falcon-40b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE
+        command:
+          - /bin/sh
+          - -c
+          - accelerate launch --config_file config.yaml --num_processes 1 --num_machines 1 --use_deepspeed --machine_rank 0 --gpu_ids all inference-api.py
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: n40binstruct
diff --git a/presets/k8s/falcon-40b/falcon-40b-service.yaml b/presets/k8s/falcon-40b/falcon-40b-service.yaml
new file mode 100644
index 000000000..599f70ca5
--- /dev/null
+++ b/presets/k8s/falcon-40b/falcon-40b-service.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: falcon-40b
+spec:
+  selector:
+    app: falcon
+    statefulset.kubernetes.io/pod-name: falcon-40b-0
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 5000
+  type: LoadBalancer
+  publishNotReadyAddresses: true
diff --git a/presets/k8s/falcon-40b/falcon-40b-statefulset.yaml b/presets/k8s/falcon-40b/falcon-40b-statefulset.yaml
new file mode 100644
index 000000000..c213867df
--- /dev/null
+++ b/presets/k8s/falcon-40b/falcon-40b-statefulset.yaml
@@ -0,0 +1,51 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: falcon-40b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE
+        command:
+          - /bin/sh
+          - -c
+          - accelerate launch --config_file config.yaml --num_processes 1 --num_machines 1 --use_deepspeed --machine_rank 0 --gpu_ids all inference-api.py
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: falcon40b