diff --git a/.github/matrix-configs.json b/.github/matrix-configs.json
new file mode 100644
index 000000000..a1be0b14b
--- /dev/null
+++ b/.github/matrix-configs.json
@@ -0,0 +1,100 @@
+[
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "falcon-7b",
+      "dockerfile": "docker/presets/falcon/Dockerfile",
+      "build_args": "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-7b"
+    },
+    "shouldBuildFalcon": "true"
+  },
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "falcon-7b-instruct",
+      "dockerfile": "docker/presets/falcon/Dockerfile",
+      "build_args": "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-7b-instruct"
+    },
+    "shouldBuildFalcon": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "falcon-40b",
+      "dockerfile": "docker/presets/falcon/Dockerfile",
+      "build_args": "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-40b"
+    },
+    "shouldBuildFalcon": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "falcon-40b-instruct",
+      "dockerfile": "docker/presets/falcon/Dockerfile",
+      "build_args": "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-40b-instruct"
+    },
+    "shouldBuildFalcon": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-7b",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-7b --build-arg SRC_DIR=/home/presets/llama-2"
+    },
+    "shouldBuildLlama2": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-13b",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-13b --build-arg SRC_DIR=/home/presets/llama-2"
+    },
+    "shouldBuildLlama2": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-70b",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-70b --build-arg SRC_DIR=/home/presets/llama-2"
+    },
+    "shouldBuildLlama2": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-7b-chat",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-7b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
+    },
+    "shouldBuildLlama2Chat": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-13b-chat",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-13b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
+    },
+    "shouldBuildLlama2Chat": "true"
+  },
+
+  {
+    "model": {
+      "runs_on": "self-hosted",
+      "name": "llama-2-70b-chat",
+      "dockerfile": "docker/presets/llama-2/Dockerfile",
+      "build_args": "--build-arg LLAMA_VERSION=llama-2-70b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
+    },
+    "shouldBuildLlama2Chat": "true"
+  }
+]
diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
index ad9a42109..39f1cf515 100644
--- a/.github/workflows/e2e-preset-test.yml
+++ b/.github/workflows/e2e-preset-test.yml
@@ -5,8 +5,11 @@ on:
     workflows: ["Build and Push Preset Models"]
     types:
       - completed
-  workflow_dispatch: {}
-
+  workflow_dispatch:
+    inputs:
+      image_tag:
+        description: 'Image Tag'
+        required: true
 env:
   GO_VERSION: "1.20"
 
@@ -15,7 +18,70 @@ permissions:
   contents: read
 
 jobs:
+  setup:
+    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
+    runs-on: self-hosted
+    outputs:
+      IMG_TAG: ${{ steps.set_final_tag.outputs.IMG_TAG }}
+    steps:
+    - name: Determine tag from dispatch
+      if: github.event_name == 'workflow_dispatch'
+      id: determine_tag
+      run: echo "IMG_TAG=${{ github.event.inputs.image_tag }}" >> $GITHUB_OUTPUT
+
+    - name: Download tag artifact
+      if: github.event_name == 'workflow_run'
+      uses: actions/github-script@v6
+      with:
+        github-token: ${{ secrets.KAITO_ACCESS_TOKEN_READ }}
+        script: |
+          let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            run_id: context.payload.workflow_run.id,
+          });
+          let matchArtifact = allArtifacts.data.artifacts.filter((artifact) => {
+            return artifact.name == "artifacts"
+          })[0];
+          let download = await github.rest.actions.downloadArtifact({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            artifact_id: matchArtifact.id,
+            archive_format: 'zip',
+          });
+          let fs = require('fs');
+          fs.writeFileSync(`/tmp/artifacts.zip`, Buffer.from(download.data));
+    - name: Unzip tag artifact
+      if: github.event_name == 'workflow_run'
+      run: |
+        mkdir -p /tmp/artifacts
+        unzip /tmp/artifacts.zip -d /tmp/artifacts
+      shell: bash
+    - name: Display downloaded artifacts
+      if: github.event_name == 'workflow_run'
+      run: |
+        echo "Downloaded artifacts:"
+        ls -ablh /tmp/artifacts
+      shell: bash
+    - name: Parse artifacts and set image tag output
+      if: github.event_name == 'workflow_run'
+      id: get_image_tag
+      run: |
+        tag=$(tail -n 1 /tmp/artifacts/tag.txt)
+        echo "IMG_TAG=$tag" >> $GITHUB_OUTPUT
+
+    - name: Set final image tag
+      id: set_final_tag
+      run: |
+        if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+          echo "IMG_TAG=${{ steps.determine_tag.outputs.IMG_TAG }}" >> $GITHUB_OUTPUT
+        else
+          echo "IMG_TAG=${{ steps.get_image_tag.outputs.IMG_TAG }}" >> $GITHUB_OUTPUT
+        fi
+
   e2e-preset-tests:
+    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
+    needs: setup
     runs-on: self-hosted
     strategy:
       fail-fast: false
@@ -26,49 +92,48 @@ jobs:
           node-vm-size: Standard_NC12s_v3
          node-osdisk-size: 100
 
-        # - name: falcon-7b-instruct
-        #   node-count: 1
-        #   node-vm-size: Standard_NC12s_v3
-        #   node-osdisk-size: 100
+        - name: falcon-7b-instruct
+          node-count: 1
+          node-vm-size: Standard_NC12s_v3
+          node-osdisk-size: 100
 
-        # Uncomment once supported by ACR
-        # - name: falcon-40b
-        #   node-count: 1
-        #   node-vm-size: Standard_NC96ads_A100_v4
-        #   node-osdisk-size: 400
+        - name: falcon-40b
+          node-count: 1
+          node-vm-size: Standard_NC96ads_A100_v4
+          node-osdisk-size: 400
 
-        # - name: falcon-40b-instruct
-        #   node-count: 1
-        #   node-vm-size: Standard_NC96ads_A100_v4
-        #   node-osdisk-size: 400
+        - name: falcon-40b-instruct
+          node-count: 1
+          node-vm-size: Standard_NC96ads_A100_v4
+          node-osdisk-size: 400
 
-        # - name: llama-2-7b
-        #   node-count: 1
-        #   node-vm-size: Standard_NC12s_v3
-        #   node-osdisk-size: 100
+        - name: llama-2-7b
+          node-count: 1
+          node-vm-size: Standard_NC12s_v3
+          node-osdisk-size: 100
 
-        # - name: llama-2-13b
-        #   node-count: 2
-        #   node-vm-size: Standard_NC12s_v3
-        #   node-osdisk-size: 150
+        - name: llama-2-13b
+          node-count: 2
+          node-vm-size: Standard_NC12s_v3
+          node-osdisk-size: 150
 
-        # Uncomment once supported by ACR
+        # Uncomment once the service/deployment is created
         # - name: llama-2-70b
        #   node-count: 2
        #   node-vm-size: Standard_NC96ads_A100_v4
        #   node-osdisk-size: 400
 
-        # - name: llama-2-7b-chat
-        #   node-count: 1
-        #   node-vm-size: Standard_NC12s_v3
-        #   node-osdisk-size: 100
+        - name: llama-2-7b-chat
+          node-count: 1
+          node-vm-size: Standard_NC12s_v3
+          node-osdisk-size: 100
 
-        # - name: llama-2-13b-chat
-        #   node-count: 2
-        #   node-vm-size: Standard_NC12s_v3
-        #   node-osdisk-size: 150
+        - name: llama-2-13b-chat
+          node-count: 2
+          node-vm-size: Standard_NC12s_v3
+          node-osdisk-size: 150
 
-        # Uncomment once supported by ACR
+        # Uncomment once the service/deployment is created
        # - name: llama-2-70b-chat
        #   node-count: 2
        #   node-vm-size: Standard_NC96ads_A100_v4
        #   node-osdisk-size: 400
@@ -80,21 +145,12 @@ jobs:
       with:
         submodules: true
         fetch-depth: 0
-
-    - name: Download image tag artifact
-      uses: actions/download-artifact@v3
-      id: download
-      with:
-        name: image-tag-artifact
-
-    - name: Read image tag
-      id: get_image_tag
+
+    - name: Get ACR Name
+      id: get_acr_name
       run: |
-        tag=$(cat image_tag.txt)
-        echo "IMAGE_TAG=$tag" >> $GITHUB_OUTPUT
-        # Set the ACR based on the tag value
-        if [[ "$tag" == "latest" ]]; then
+        if [[ "${{ needs.setup.outputs.IMG_TAG }}" == "latest" ]]; then
           echo "ACR_NAME=aimodelsregistry" >> $GITHUB_OUTPUT
         else
           echo "ACR_NAME=aimodelsregistrytest" >> $GITHUB_OUTPUT
@@ -102,23 +158,46 @@ jobs:
 
     - name: Install Azure CLI latest
       run: |
-        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+        if ! which az > /dev/null; then
+          echo "Azure CLI not found. Installing..."
+          curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+        else
+          echo "Azure CLI already installed."
+        fi
 
-    - name: Az CLI login
+    - name: 'Az CLI login'
       uses: azure/login@v1.4.6
       with:
-        client-id: ${{ secrets.AZURE_KDM_PRESET_CLIENT_ID }}
+        client-id: ${{ secrets.AZURE_KDM_PRESET_SELF_RUNNER_CLIENT_ID }}
         tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-        subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+        allow-no-subscriptions: true
+
+    - name: 'Set subscription'
+      run: az account set --subscription ${{secrets.AZURE_SUBSCRIPTION_ID}}
 
-    - name: 'Login to ACR'
-      run: az acr login --name ${{ steps.get_image_tag.outputs.ACR_NAME }}
+    - name: 'Check if Image exists in ACR'
+      id: check_image
+      run: |
+        ACR_NAME=${{ steps.get_acr_name.outputs.ACR_NAME }}
+        IMAGE_NAME=${{ matrix.image.name }}
+        TAG=${{ needs.setup.outputs.IMG_TAG }}
+
+        TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv)
+
+        if echo "$TAGS" | grep -q "^$TAG$"; then
+          echo "IMAGE_EXISTS=true" >> $GITHUB_OUTPUT
+        else
+          echo "IMAGE_EXISTS=false" >> $GITHUB_OUTPUT
+          echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME."
+        fi
 
     - name: Set up kubectl context
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
-        az aks get-credentials --resource-group llm-test --name new_demo
+        az aks get-credentials --resource-group llm-test --name GitRunner
 
     - name: Get Nodepool Name
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       id: get_nodepool_name
       run: |
         NAME_SUFFIX=${{ matrix.image.name }}
@@ -133,17 +212,18 @@ jobs:
         echo "NODEPOOL_NAME=$TRUNCATED_NAME_SUFFIX" >> $GITHUB_OUTPUT
 
     - name: Create Nodepool
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
         NODEPOOL_EXIST=$(az aks nodepool show \
           --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
-          --cluster-name new_demo \
+          --cluster-name GitRunner \
           --resource-group llm-test \
           --query 'name' -o tsv || echo "")
         echo "NODEPOOL_EXIST: $NODEPOOL_EXIST"
         if [ -z "$NODEPOOL_EXIST" ]; then
           az aks nodepool add \
             --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
-            --cluster-name new_demo \
+            --cluster-name GitRunner \
            --resource-group llm-test \
            --node-count ${{ matrix.image.node-count }} \
            --node-vm-size ${{ matrix.image.node-vm-size }} \
@@ -154,7 +234,7 @@ jobs:
        else
          NODEPOOL_STATE=$(az aks nodepool show \
            --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
-            --cluster-name new_demo \
+            --cluster-name GitRunner \
            --resource-group llm-test \
            --query 'provisioningState' -o tsv)
          echo "NODEPOOL_STATE: $NODEPOOL_STATE"
@@ -167,9 +247,11 @@ jobs:
        fi
 
     - name: Create Service
-      run: kubectl apply -f pkg/presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-service.yaml
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
+      run: kubectl apply -f presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-service.yaml
 
     - name: Retrieve External Service IP
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       id: get_ip
       run: |
        while [[ -z $SERVICE_IP ]]; do
@@ -180,25 +262,30 @@ jobs:
        echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT
 
     - name: Replace IP and Deploy Statefulset to K8s
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
-        sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" pkg/presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
-        sed -i "s/TAG_HERE/${{ steps.get_image_tag.outputs.IMAGE_TAG }}/g" pkg/presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
-        sed -i "s/REPO_HERE/${{ steps.get_image_tag.outputs.ACR_NAME }}/g" pkg/presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
-        kubectl apply -f pkg/presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
+        sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
+        sed -i "s/TAG_HERE/${{ needs.setup.outputs.IMG_TAG }}/g" presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
+        sed -i "s/REPO_HERE/${{ steps.get_acr_name.outputs.ACR_NAME }}/g" presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
+        kubectl apply -f presets/k8s/${{ matrix.image.name }}/${{ matrix.image.name }}-statefulset.yaml
 
     - name: Wait for Statefulset to be ready
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
        kubectl rollout status statefulset/${{ matrix.image.name }}
 
     - name: Test home endpoint
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
        curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/
 
     - name: Test healthz endpoint
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
        curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz
 
     - name: Test inference endpoint
+      if: steps.check_image.outputs.IMAGE_EXISTS == 'true'
       run: |
        if [[ "${{ matrix.image.name }}" == *"llama"* && "${{ matrix.image.name }}" == *"-chat"* ]]; then
          echo "Testing inference for ${{ matrix.image.name }}"
@@ -249,15 +336,29 @@ jobs:
     - name: Cleanup
       if: always()
       run: |
-        # Delete K8s Service
-        kubectl delete svc ${{ matrix.image.name }}
+        # Check and Delete K8s Service if it exists
+        if kubectl get svc ${{ matrix.image.name }} > /dev/null 2>&1; then
+          kubectl delete svc ${{ matrix.image.name }}
+        fi
+
+        # Check and Delete K8s StatefulSet if it exists
+        if kubectl get statefulset ${{ matrix.image.name }} > /dev/null 2>&1; then
+          kubectl delete statefulset ${{ matrix.image.name }}
+        fi
 
-        # Delete K8s Deployment
-        kubectl delete statefulset ${{ matrix.image.name }}
+        # Check and Delete AKS Nodepool if it exists
+        if [ -n "${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }}" ]; then
+          NODEPOOL_EXIST=$(az aks nodepool show \
+            --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
+            --cluster-name GitRunner \
+            --resource-group llm-test \
+            --query 'name' -o tsv || echo "")
 
-        # Delete AKS Nodepool
-        az aks nodepool delete \
-        --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
-        --cluster-name new_demo \
-        --resource-group llm-test
-        
\ No newline at end of file
+          if [ -n "$NODEPOOL_EXIST" ]; then
+            az aks nodepool delete \
+              --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
+              --cluster-name GitRunner \
+              --resource-group llm-test
+          fi
+        fi
+        
\ No newline at end of file
diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml
index 3419725cd..a05c3f183 100644
--- a/.github/workflows/preset-image-build.yml
+++ b/.github/workflows/preset-image-build.yml
@@ -23,7 +23,7 @@ on:
       default: 'no'
     image_tag:
       description: 'Image Tag'
-      required: false
+      required: true
 
 permissions:
   id-token: write
@@ -37,8 +37,6 @@ jobs:
      FALCON_MODIFIED: ${{ steps.check_modified_paths.outputs.FALCON_MODIFIED }}
      LLAMA2_MODIFIED: ${{ steps.check_modified_paths.outputs.LLAMA2_MODIFIED }}
      LLAMA2_CHAT_MODIFIED: ${{ steps.check_modified_paths.outputs.LLAMA2_CHAT_MODIFIED }}
-      ACR_NAME: ${{ steps.acr_login.outputs.ACR_NAME }}
-      POD_NAME: ${{ steps.get_pod_name.outputs.POD_NAME }}
    steps:
    - name: Checkout
      uses: actions/checkout@v4
@@ -87,7 +85,7 @@ jobs:
        echo "LLAMA2_MODIFIED=$LLAMA2_MODIFIED" >> $GITHUB_OUTPUT
        echo "LLAMA2_CHAT_MODIFIED=$LLAMA2_CHAT_MODIFIED" >> $GITHUB_OUTPUT
 
-    - name: Images to Build
+    - name: Models to Build
      run: |
        echo "FALCON_MODIFIED for this job: ${{ steps.check_modified_paths.outputs.FALCON_MODIFIED }}"
        echo "LLAMA2_MODIFIED for this job: ${{ steps.check_modified_paths.outputs.LLAMA2_MODIFIED }}"
@@ -111,17 +109,20 @@ jobs:
      run: |
        echo "image_tag for this job: ${{ steps.set_tag.outputs.image_tag }}"
 
-    - name: Save image tag as artifact
-      id: image_tag_path
+    - name: Save image tag as an artifact for other workflows
      run: |
-        echo "${{ steps.set_tag.outputs.image_tag }}" > image_tag.txt
-        echo "IMAGE_TAG_PATH=$(pwd)/image_tag.txt" >> $GITHUB_OUTPUT
-
+        sudo mkdir -p /tmp/artifacts
+        sudo chmod 777 /tmp/artifacts
+        echo ${{ steps.set_tag.outputs.image_tag }} | sudo tee /tmp/artifacts/tag.txt
+        sudo chmod 666 /tmp/artifacts/tag.txt
+        # ls -l /tmp/artifacts  # Check the permissions of the directory contents
+        cat /tmp/artifacts/tag.txt
+
    - name: Upload image tag as artifact
      uses: actions/upload-artifact@v3
      with:
-        name: image-tag-artifact
-        path: ${{ steps.image_tag_path.outputs.IMAGE_TAG_PATH }}
+        name: artifacts
+        path: /tmp/artifacts
 
    - name: Install Azure CLI latest
      run: |
@@ -131,7 +132,40 @@ jobs:
        else
          echo "Azure CLI already installed."
        fi
-
+
+  matrix_prep:
+    needs: setup
+    runs-on: self-hosted
+    outputs:
+      matrix: ${{ steps.set_matrix.outputs.matrix }}
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+        fetch-depth: 0
+
+    - name: "Set matrix"
+      id: set_matrix
+      run: |
+        matrix=$(jq --arg FALCON_MODIFIED "${{ needs.setup.outputs.FALCON_MODIFIED }}" --arg LLAMA2_MODIFIED "${{ needs.setup.outputs.LLAMA2_MODIFIED }}" --arg LLAMA2_CHAT_MODIFIED "${{ needs.setup.outputs.LLAMA2_CHAT_MODIFIED }}" 'map(
+          . | select((.shouldBuildFalcon == $FALCON_MODIFIED) or (.shouldBuildLlama2 == $LLAMA2_MODIFIED) or (.shouldBuildLlama2Chat == $LLAMA2_CHAT_MODIFIED))
+        )' .github/matrix-configs.json)
+        echo "matrix={\"include\":$(echo $matrix)}" >> $GITHUB_OUTPUT
+
+  build-models:
+    needs: [setup, matrix_prep]
+    runs-on: self-hosted
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.matrix_prep.outputs.matrix)}}
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+        fetch-depth: 0
+
    - name: 'Az CLI login'
      uses: azure/login@v1.4.6
      with:
@@ -145,18 +179,22 @@ jobs:
    - name: 'Attach and Login to ACR'
      id: acr_login
      run: |
-        if [[ "${{ steps.set_tag.outputs.image_tag }}" == "latest" ]]; then
-          echo "ACR_NAME=aimodelsregistry" >> $GITHUB_OUTPUT
-          az aks update -n GitRunner -g llm-test --attach-acr aimodelsregistry
-          az acr login -n aimodelsregistry --expose-token
+        if [[ "${{ needs.setup.outputs.image_tag }}" == "latest" ]]; then
+          ACR_NAME="aimodelsregistry"
        else
-          echo "ACR_NAME=aimodelsregistrytest" >> $GITHUB_OUTPUT
-          az aks update -n GitRunner -g llm-test --attach-acr aimodelsregistrytest
-          az acr login -n aimodelsregistrytest --expose-token
+          ACR_NAME="aimodelsregistrytest"
        fi
-    - name: Get Context
-      run: az aks get-credentials -n GitRunner -g llm-test
-
+
+        CHECK_ACR_OUTPUT=$(az aks check-acr -g llm-test -n GitRunner --acr $ACR_NAME)
+        if [[ ! $CHECK_ACR_OUTPUT =~ "Your cluster can pull images from $ACR_NAME.azurecr.io!" ]]; then
+          az aks update -n GitRunner -g llm-test --attach-acr $ACR_NAME
+        fi
+        az acr login -n $ACR_NAME --expose-token
+        echo "ACR_NAME=$ACR_NAME" >> $GITHUB_OUTPUT
+
+    - name: Get Context
+      run: az aks get-credentials -n GitRunner -g llm-test
+
    - name: Check if Docker Pod is Running (if not run it)
      run: |
        DEPLOYMENT=$(kubectl get deployment docker-deployment -o=jsonpath='{.metadata.name}' --ignore-not-found)
@@ -181,86 +219,7 @@ jobs:
          kubectl exec ${{ steps.get_pod_name.outputs.POD_NAME }} -- \
          docker login ${{ steps.acr_login.outputs.ACR_NAME }}.azurecr.io --username ${{ secrets.ACR_AMRT_USERNAME }} --password ${{ secrets.ACR_AMRT_PASSWORD }}
        fi
-
-  build-models:
-    needs: setup
-    runs-on: self-hosted
-    strategy:
-      fail-fast: false
-      matrix:
-        model:
-        - name: falcon-7b
-          dockerfile: docker/presets/falcon/Dockerfile
-          build_args: "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-7b"
-
-        - name: falcon-7b-instruct
-          dockerfile: docker/presets/falcon/Dockerfile
-          build_args: "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-7b-instruct"
-
-        - name: llama-2-7b
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-7b --build-arg SRC_DIR=/home/presets/llama-2"
-
-        - name: llama-2-7b-chat
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-7b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
-
-        - name: llama-2-13b
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-13b --build-arg SRC_DIR=/home/presets/llama-2"
-
-        - name: llama-2-13b-chat
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-13b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
-
-        # TODO: Support large models
-        - name: falcon-40b
-          dockerfile: docker/presets/falcon/Dockerfile
-          build_args: "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-40b"
-
-        - name: falcon-40b-instruct
-          dockerfile: docker/presets/falcon/Dockerfile
-          build_args: "--build-arg FALCON_MODEL_NAME=tiiuae/falcon-40b-instruct"
-
-        - name: llama-2-70b
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-70b --build-arg SRC_DIR=/home/presets/llama-2"
-
-        - name: llama-2-70b-chat
-          dockerfile: docker/presets/llama-2/Dockerfile
-          build_args: "--build-arg LLAMA_VERSION=llama-2-70b-chat --build-arg SRC_DIR=/home/presets/llama-2-chat"
-
-        include:
-        - name: falcon-7b
-          if: ${{ needs.setup.outputs.FALCON_MODIFIED == 'true' }}
-        - name: falcon-7b-instruct
-          if: ${{ needs.setup.outputs.FALCON_MODIFIED == 'true' }}
-        - name: llama-2-7b
-          if: ${{ needs.setup.outputs.LLAMA2_MODIFIED == 'true' }}
-        - name: llama-2-7b-chat
-          if: ${{ needs.setup.outputs.LLAMA2_CHAT_MODIFIED == 'true' }}
-        - name: llama-2-13b
-          if: ${{ needs.setup.outputs.LLAMA2_MODIFIED == 'true' }}
-        - name: llama-2-13b-chat
-          if: ${{ needs.setup.outputs.LLAMA2_CHAT_MODIFIED == 'true' }}
-        # TODO: Support large models
-        - name: falcon-40b
-          if: ${{ needs.setup.outputs.FALCON_MODIFIED == 'true' }}
-        - name: falcon-40b-instruct
-          if: ${{ needs.setup.outputs.FALCON_MODIFIED == 'true' }}
-
-        - name: llama-2-70b
-          if: ${{ needs.setup.outputs.LLAMA2_MODIFIED == 'true' }}
-        - name: llama-2-70b-chat
-          if: ${{ needs.setup.outputs.LLAMA2_CHAT_MODIFIED == 'true' }}
-
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v4
-      with:
-        submodules: true
-        fetch-depth: 0
-
+
    - name: Build model (with retries)
      run: |
        retries=3
        while [ $retries -gt 0 ]; do
          echo "Docker BUILD_ARGS: $BUILD_ARGS"
 
-          kubectl exec ${{ needs.setup.outputs.POD_NAME }} -- \
+          kubectl exec ${{ steps.get_pod_name.outputs.POD_NAME }} -- \
          docker build \
            $BUILD_ARGS \
-            -t ${{ needs.setup.outputs.ACR_NAME }}.azurecr.io/${{ matrix.model.name }}:${{ needs.setup.outputs.image_tag }} \
+            -t ${{ steps.acr_login.outputs.ACR_NAME }}.azurecr.io/${{ matrix.model.name }}:${{ needs.setup.outputs.image_tag }} \
            -f /home/${{ matrix.model.dockerfile }} \
            . && break
          retries=$((retries-1))
@@ -289,8 +248,8 @@ jobs:
        retries=3
        while [ $retries -gt 0 ]; do
          # Push the Docker image to ACR
-          kubectl exec ${{ needs.setup.outputs.POD_NAME }} -- \
-          docker push ${{ needs.setup.outputs.ACR_NAME }}.azurecr.io/${{ matrix.model.name }}:${{ needs.setup.outputs.image_tag }}
+          kubectl exec ${{ steps.get_pod_name.outputs.POD_NAME }} -- \
+          docker push ${{ steps.acr_login.outputs.ACR_NAME }}.azurecr.io/${{ matrix.model.name }}:${{ needs.setup.outputs.image_tag }}
 
          # Check if the push was successful
          if [ $? -eq 0 ]; then
diff --git a/docker/presets/llama-2/Dockerfile b/docker/presets/llama-2/Dockerfile
index a7c2910dc..3f091053b 100644
--- a/docker/presets/llama-2/Dockerfile
+++ b/docker/presets/llama-2/Dockerfile
@@ -23,16 +23,6 @@ RUN pip install 'uvicorn[standard]'
 
 ARG LLAMA_VERSION
 ARG SRC_DIR
-# ARG EXTERNAL_IP
-# ARG EXTERNAL_PORT
-# ARG WEB_SERVER_AUTH_TOKEN
-# ENV AUTH_TOKEN_ENV_VAR=${WEB_SERVER_AUTH_TOKEN}
-
-# Copy Go download script into the Docker image
-# COPY /home/docker/presets/download_script.go /workspace/download_script.go
-
-# Use Go download script to fetch model weights
-# RUN go run /workspace/download_script.go "private" ${LLAMA_VERSION} "/workspace/llama/llama-2/weights" ${EXTERNAL_IP} ${EXTERNAL_PORT}
 
 ADD /home/llama/${LLAMA_VERSION} /workspace/llama/llama-2/weights
 ADD ${SRC_DIR} /workspace/llama/llama-2
diff --git a/presets/falcon/inference-api.py b/presets/falcon/inference-api.py
index 02efd26c5..9fa5fbaed 100644
--- a/presets/falcon/inference-api.py
+++ b/presets/falcon/inference-api.py
@@ -12,7 +12,7 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import transformers
 import torch
-import torch.distributed as dist
+# import torch.distributed as dist
 
 parser = argparse.ArgumentParser(description='Falcon Model Configuration')
 parser.add_argument('--load_in_8bit', default=False, action='store_true', help='Load model in 8-bit mode')
diff --git a/presets/k8s/docker.yaml b/presets/k8s/docker.yaml
index d07617911..ef4e152a4 100644
--- a/presets/k8s/docker.yaml
+++ b/presets/k8s/docker.yaml
@@ -36,4 +36,6 @@ spec:
       - name: falcon-volume
         hostPath:
           path: /falcon
-          type: Directory
\ No newline at end of file
+          type: Directory
+      nodeSelector:
+        kubernetes.io/hostname: aks-llamserver-28588359-vmss000000
diff --git a/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-service.yaml b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-service.yaml
new file mode 100644
index 000000000..cc41fb6f7
--- /dev/null
+++ b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-service.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: falcon-40b-instruct
+spec:
+  selector:
+    app: falcon
+    statefulset.kubernetes.io/pod-name: falcon-40b-instruct-0
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 5000
+  type: LoadBalancer
+  publishNotReadyAddresses: true
diff --git a/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml
new file mode 100644
index 000000000..0c39ec31e
--- /dev/null
+++ b/presets/k8s/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml
@@ -0,0 +1,51 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: falcon-40b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE
+        command:
+          - /bin/sh
+          - -c
+          - accelerate launch --config_file config.yaml --num_processes 1 --num_machines 1 --use_deepspeed --machine_rank 0 --gpu_ids all inference-api.py
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: n40binstruct
diff --git a/presets/k8s/falcon-40b/falcon-40b-service.yaml b/presets/k8s/falcon-40b/falcon-40b-service.yaml
new file mode 100644
index 000000000..599f70ca5
--- /dev/null
+++ b/presets/k8s/falcon-40b/falcon-40b-service.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: falcon-40b
+spec:
+  selector:
+    app: falcon
+    statefulset.kubernetes.io/pod-name: falcon-40b-0
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 5000
+  type: LoadBalancer
+  publishNotReadyAddresses: true
diff --git a/presets/k8s/falcon-40b/falcon-40b-statefulset.yaml b/presets/k8s/falcon-40b/falcon-40b-statefulset.yaml
new file mode 100644
index 000000000..c213867df
--- /dev/null
+++ b/presets/k8s/falcon-40b/falcon-40b-statefulset.yaml
@@ -0,0 +1,51 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: falcon-40b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE
+        command:
+          - /bin/sh
+          - -c
+          - accelerate launch --config_file config.yaml --num_processes 1 --num_machines 1 --use_deepspeed --machine_rank 0 --gpu_ids all inference-api.py
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: falcon40b