Merge branch 'nod-ai:main' into fix_worker_idx
IanNod authored Jan 31, 2025
2 parents 0257e7e + 7671d57 commit fd8ea7f
Showing 37 changed files with 619 additions and 545 deletions.
23 changes: 4 additions & 19 deletions .github/workflows/pkgci_shark_ai.yml
@@ -26,7 +26,7 @@ jobs:
matrix:
version: [3.11]
fail-fast: false
runs-on: mi300x-3
runs-on: azure-cpubuilder-linux-scale
# runs-on: ubuntu-latest # everything else works but this throws an "out of resources" during model loading
# TODO: make a copy of this that runs on standard runners with tiny llama instead of a 8b model
defaults:
@@ -44,10 +44,6 @@ jobs:
with:
python-version: ${{matrix.version}}

- name: Set Python version without dot
run: |
echo "PY_VERSION_NO_DOT=$(echo ${{ matrix.version }} | tr -d '.')" >> $GITHUB_ENV
- name: Setup UV caching
run: |
CACHE_DIR="${GITHUB_WORKSPACE}/.uv-cache"
@@ -60,23 +56,12 @@ jobs:
path: .uv-cache
key: ${{ runner.os }}-uv-py${{ matrix.version }}-${{ hashFiles('requirements-iree-pinned.txt', 'pytorch-cpu-requirements.txt', 'sharktank/requirements.txt', 'sharktank/requirements-tests.txt', 'shortfin/requirements-tests.txt') }}

- name: Download sharktank artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: snapshot-sharktank-linux-x86_64-cp${{ env.PY_VERSION_NO_DOT }}-cp${{ env.PY_VERSION_NO_DOT }}
path: ${{ env.PACKAGE_DOWNLOAD_DIR }}

- name: Download shortfin artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: snapshot-shortfin-linux-x86_64-cp${{ env.PY_VERSION_NO_DOT }}-cp${{ env.PY_VERSION_NO_DOT }}
path: ${{ env.PACKAGE_DOWNLOAD_DIR }}

- name: Download shark-ai artifacts
- name: Download package artifacts
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: snapshot-shark-ai-linux-x86_64-cp${{ env.PY_VERSION_NO_DOT }}-cp${{ env.PY_VERSION_NO_DOT }}
pattern: snapshot-*-linux-x86_64-*
path: ${{ env.PACKAGE_DOWNLOAD_DIR }}
merge-multiple: true

- name: Setup venv
run: |
27 changes: 25 additions & 2 deletions .github/workflows/update_iree_requirement_pins.yml
@@ -25,6 +25,25 @@ env:
GIT_BRANCH_NAME: ${{ inputs.branch-name || 'integrates/iree' }}

jobs:
determine-duty-engineer:
runs-on: ubuntu-24.04
outputs:
duty-engineer: ${{ steps.set-duty.outputs.engineer }}
steps:
- name: Determine IREE bump duty engineer
id: set-duty
run: |
# rotation schedule (GitHub usernames); add yourself into this schedule to get notified when it's your turn.
ENGINEERS=('renxida')
# current week number (1-53)
WEEK=$(date +%V)
INDEX=$(( WEEK % ${#ENGINEERS[@]} ))
DUTY_ENGINEER=${ENGINEERS[$INDEX]}
echo "engineer=$DUTY_ENGINEER" >> $GITHUB_OUTPUT
echo "This week's IREE bump duty engineer: $DUTY_ENGINEER"
check-for-existing-branch:
if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
runs-on: ubuntu-24.04
@@ -43,8 +62,8 @@ jobs:
fi
update-iree:
needs: check-for-existing-branch
runs-on: ubuntu-22.04
needs: [check-for-existing-branch, determine-duty-engineer]
runs-on: ubuntu-24.04
if: ${{ needs.check-for-existing-branch.outputs.branch-exists == 0 }}

steps:
@@ -87,10 +106,14 @@ jobs:
body: |
Diff: https://github.com/iree-org/iree/compare/iree-${{ env.CURRENT_IREE_BASE_COMPILER_VERSION }}...iree-${{ env.LATEST_IREE_BASE_COMPILER_VERSION }}
IREE bump duty engineer this week: @${{ needs.determine-duty-engineer.outputs.duty-engineer }}
Auto-generated by GitHub Actions using [`.github/workflows/update_iree_requirement_pins.yml`](https://github.com/${{ github.repository }}/blob/main/.github/workflows/update_iree_requirement_pins.yml).
commit-message: "Bump IREE to ${{ env.LATEST_IREE_BASE_COMPILER_VERSION }}."
assignees: ${{ needs.determine-duty-engineer.outputs.duty-engineer }}

- name: Write summary
if: ${{ steps.cpr.outputs.pull-request-number }}
run: |
echo "Pull Request URL: ${{ steps.cpr.outputs.pull-request-url }}" >> ${GITHUB_STEP_SUMMARY}
echo "IREE bump duty engineer: @${{ needs.determine-duty-engineer.outputs.duty-engineer }}" >> ${GITHUB_STEP_SUMMARY}
32 changes: 17 additions & 15 deletions app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py
@@ -14,13 +14,12 @@
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
)
from integration_tests.llm.utils import (
compile_model,
end_log_group,
export_paged_llm_v1,
download_with_hf_datasets,
start_log_group,
from integration_tests.llm.model_management import (
ModelConfig,
ModelProcessor,
ModelSource,
)
from integration_tests.llm.logging_utils import start_log_group, end_log_group

logger = logging.getLogger(__name__)

@@ -47,16 +46,19 @@ def pre_process_model(request, tmp_path_factory):
settings = request.param["settings"]
batch_sizes = request.param["batch_sizes"]

mlir_path = tmp_dir / "model.mlir"
config_path = tmp_dir / "config.json"
vmfb_path = tmp_dir / "model.vmfb"

model_path = tmp_dir / model_param_file_name
download_with_hf_datasets(tmp_dir, model_name)

export_paged_llm_v1(mlir_path, config_path, model_path, batch_sizes)
# Configure model
config = ModelConfig(
model_file=model_param_file_name,
tokenizer_id=model_name, # Using model_name as tokenizer_id, adjust if needed
batch_sizes=batch_sizes,
device_settings=settings,
source=ModelSource.HUGGINGFACE,
repo_id=model_name, # Using model_name as repo_id, adjust if needed
)

compile_model(mlir_path, vmfb_path, settings)
# Process model through all stages
processor = ModelProcessor(tmp_dir)
artifacts = processor.process_model(config)

logger.info("Model artifacts setup successfully" + end_log_group())
MODEL_DIR_CACHE[param_key] = tmp_dir
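Pulled out of the fixture above as a standalone sketch, the new model-management flow looks roughly like this; the file and repo names are placeholders, and anything about `ModelConfig` or `process_model` beyond what this diff shows is an assumption:

```python
from pathlib import Path

from integration_tests.llm.model_management import (
    ModelConfig,
    ModelProcessor,
    ModelSource,
)

tmp_dir = Path("/tmp/model_artifacts")  # placeholder; the tests use a pytest tmp_path

config = ModelConfig(
    model_file="model.gguf",         # placeholder parameter-file name
    tokenizer_id="org/model-repo",   # placeholder Hugging Face repo id
    batch_sizes=(1, 4),
    device_settings=None,            # supply the test's device settings here
    source=ModelSource.HUGGINGFACE,
    repo_id="org/model-repo",        # placeholder Hugging Face repo id
)

# Runs all stages (download, export, compile) and returns the resulting artifacts.
processor = ModelProcessor(tmp_dir)
artifacts = processor.process_model(config)
```

This single call replaces the separate `download_with_hf_datasets`, `export_paged_llm_v1`, and `compile_model` helpers previously imported from `integration_tests.llm.utils`.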
Additional changed file (filename not shown in this view):
@@ -13,8 +13,12 @@
from sglang import bench_serving

from .utils import SGLangBenchmarkArgs, log_jsonl_result

from integration_tests.llm.utils import download_tokenizer, wait_for_server
from integration_tests.llm.model_management import (
ModelConfig,
ModelProcessor,
ModelSource,
)
from integration_tests.llm.server_management import ServerInstance

logger = logging.getLogger(__name__)

@@ -26,17 +30,31 @@
def test_sglang_benchmark(request_rate, tokenizer_id, sglang_args, tmp_path_factory):
tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")

# Download tokenizer for llama3_8B_fp16
download_tokenizer(tmp_dir, tokenizer_id)
# Download tokenizer using ModelProcessor
config = ModelConfig(
model_file="tokenizer.json", # Only need tokenizer
tokenizer_id=tokenizer_id,
batch_sizes=(1,), # Not relevant for tokenizer only
device_settings=None, # Not relevant for tokenizer only
source=ModelSource.HUGGINGFACE,
repo_id=tokenizer_id,
)
processor = ModelProcessor(tmp_dir)
artifacts = processor.process_model(config)

logger.info("Beginning SGLang benchmark test...")

port = sglang_args
base_url = f"http://localhost:{port}"

# Setting a high timeout gives enough time for downloading model artifacts
# and starting up server... Takes a little longer than shortfin.
wait_for_server(base_url, timeout=600)
# Wait for server using ServerInstance's method
server = ServerInstance(
None
) # We don't need config since we're just using wait_for_ready
server.port = int(port) # Set port manually since we didn't start the server
server.wait_for_ready(
timeout=600
) # High timeout for model artifacts download and server startup

benchmark_args = SGLangBenchmarkArgs(
backend="sglang",
Additional changed file (filename not shown in this view):
@@ -19,12 +19,9 @@
log_jsonl_result,
)

from integration_tests.llm.utils import (
end_log_group,
find_available_port,
start_llm_server,
start_log_group,
)
from integration_tests.llm.logging_utils import end_log_group, start_log_group
from integration_tests.llm.server_management import ServerConfig, ServerInstance
from integration_tests.llm.model_management import ModelArtifacts

logger = logging.getLogger(__name__)

@@ -83,20 +80,24 @@ def test_shortfin_benchmark(
model_path = tmp_dir / model_param_file_name

# Start shortfin llm server
server_process, port = start_llm_server(
tokenizer_path,
config_path,
vmfb_path,
model_path,
device_settings,
timeout=30,
server_config = ServerConfig(
artifacts=ModelArtifacts(
weights_path=model_path,
tokenizer_path=tokenizer_path,
mlir_path=tmp_dir / "model.mlir",
vmfb_path=vmfb_path,
config_path=config_path,
),
device_settings=device_settings,
)
server = ServerInstance(server_config)
server.start()

# Run and collect SGLang Serving Benchmark
benchmark_args = SGLangBenchmarkArgs(
backend="shortfin",
num_prompt=10,
base_url=f"http://localhost:{port}",
base_url=f"http://localhost:{server.port}",
tokenizer=tmp_dir,
request_rate=request_rate,
)
@@ -130,5 +131,4 @@ def test_shortfin_benchmark(
except Exception as e:
logger.error(e)

server_process.terminate()
server_process.wait()
server.stop()
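The server lifecycle introduced here, shown as a minimal standalone sketch; the artifact paths are placeholders, and the internals of `ServerInstance.start()` (beyond it exposing `server.port`) are assumed from the diff:

```python
from pathlib import Path

from integration_tests.llm.model_management import ModelArtifacts
from integration_tests.llm.server_management import ServerConfig, ServerInstance

tmp_dir = Path("/tmp/shortfin_benchmark")  # placeholder working directory

server_config = ServerConfig(
    artifacts=ModelArtifacts(
        weights_path=tmp_dir / "model.gguf",       # placeholder weights file
        tokenizer_path=tmp_dir / "tokenizer.json",
        mlir_path=tmp_dir / "model.mlir",
        vmfb_path=tmp_dir / "model.vmfb",
        config_path=tmp_dir / "config.json",
    ),
    device_settings=None,  # supply the test's device settings here
)

server = ServerInstance(server_config)
server.start()  # assumed to pick a free port, exposed as server.port
try:
    base_url = f"http://localhost:{server.port}"
    # ... run the benchmark against base_url ...
finally:
    server.stop()
```

In the test itself, `server.stop()` replaces the old `server_process.terminate()` / `server_process.wait()` pair.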
21 changes: 21 additions & 0 deletions app_tests/integration_tests/llm/logging_utils.py
@@ -0,0 +1,21 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import os


def start_log_group(headline):
"""Start a collapsible log group in GitHub Actions."""
if os.environ.get("GITHUB_ACTIONS") == "true":
return f"\n::group::{headline}"
return ""


def end_log_group():
"""End a collapsible log group in GitHub Actions."""
if os.environ.get("GITHUB_ACTIONS") == "true":
return "\n::endgroup::"
return ""