diff --git a/Makefile b/Makefile index 08ca8fcb0..4e254c1a8 100644 --- a/Makefile +++ b/Makefile @@ -319,11 +319,11 @@ azure-karpenter-helm: ## Update Azure client env vars and settings in helm valu ##@ Build .PHONY: build build: manifests generate fmt vet ## Build manager binary. - go build -o bin/manager cmd/*.go + go build -o bin/manager cmd/workspace/*.go .PHONY: run run: manifests generate fmt vet ## Run a controller from your host. - go run ./cmd/main.go + go run ./cmd/workspace/main.go ##@ Build Dependencies ## Location to install dependencies to diff --git a/api/v1alpha1/labels.go b/api/v1alpha1/labels.go index 409c08ef8..65ef0ca09 100644 --- a/api/v1alpha1/labels.go +++ b/api/v1alpha1/labels.go @@ -3,6 +3,8 @@ package v1alpha1 +import "github.com/azure/kaito/pkg/model" + const ( // Non-prefixed labels/annotations are reserved for end-use. @@ -27,4 +29,25 @@ const ( // WorkspaceRevisionAnnotation is the Annotations for revision number WorkspaceRevisionAnnotation = "workspace.kaito.io/revision" + + // AnnotationWorkspaceBackend is the annotation for backend selection. + AnnotationWorkspaceBackend = KAITOPrefix + "backend" ) + +// GetWorkspaceBackendName returns the runtime name of the workspace. +func GetWorkspaceBackendName(ws *Workspace) model.BackendName { + if ws == nil { + panic("workspace is nil") + } + runtime := model.BackendNameVLLM + + name := ws.Annotations[AnnotationWorkspaceBackend] + switch name { + case string(model.BackendNameHuggingfaceTransformers): + runtime = model.BackendNameHuggingfaceTransformers + case string(model.BackendNameVLLM): + runtime = model.BackendNameVLLM + } + + return runtime +} diff --git a/api/v1alpha1/workspace_validation.go b/api/v1alpha1/workspace_validation.go index 07e45c9b7..3493b3566 100644 --- a/api/v1alpha1/workspace_validation.go +++ b/api/v1alpha1/workspace_validation.go @@ -169,7 +169,7 @@ func (r *TuningSpec) validateCreate(ctx context.Context, workspaceNamespace stri // Currently require a preset to specified, in future we can consider defining a template if r.Preset == nil { errs = errs.Also(apis.ErrMissingField("Preset")) - } else if presetName := string(r.Preset.Name); !utils.IsValidPreset(presetName) { + } else if presetName := string(r.Preset.Name); !plugin.IsValidPreset(presetName) { errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported tuning preset name %s", presetName), "presetName")) } return errs @@ -407,7 +407,7 @@ func (i *InferenceSpec) validateCreate() (errs *apis.FieldError) { if i.Preset != nil { presetName := string(i.Preset.Name) // Validate preset name - if !utils.IsValidPreset(presetName) { + if !plugin.IsValidPreset(presetName) { errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported inference preset name %s", presetName), "presetName")) } // Validate private preset has private image specified diff --git a/docker/presets/models/tfs/Dockerfile b/docker/presets/models/tfs/Dockerfile index 61c263b7f..1b9d4cdba 100644 --- a/docker/presets/models/tfs/Dockerfile +++ b/docker/presets/models/tfs/Dockerfile @@ -5,31 +5,39 @@ ARG MODEL_TYPE ARG VERSION # Set the working directory -WORKDIR /workspace/tfs +WORKDIR /workspace -# Write the version to a file -RUN echo $VERSION > /workspace/tfs/version.txt - -# First, copy just the preset files and install dependencies -# This is done before copying the code to utilize Docker's layer caching and -# avoid reinstalling dependencies unless the requirements file changes. 
-# Inference COPY kaito/presets/inference/${MODEL_TYPE}/requirements.txt /workspace/tfs/inference-requirements.txt -RUN pip install --no-cache-dir -r inference-requirements.txt - -COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py - -# Fine Tuning COPY kaito/presets/tuning/${MODEL_TYPE}/requirements.txt /workspace/tfs/tuning-requirements.txt -RUN pip install --no-cache-dir -r tuning-requirements.txt - -COPY kaito/presets/tuning/${MODEL_TYPE}/cli.py /workspace/tfs/cli.py -COPY kaito/presets/tuning/${MODEL_TYPE}/fine_tuning.py /workspace/tfs/fine_tuning.py -COPY kaito/presets/tuning/${MODEL_TYPE}/parser.py /workspace/tfs/parser.py -COPY kaito/presets/tuning/${MODEL_TYPE}/dataset.py /workspace/tfs/dataset.py - -# Copy the metrics server -COPY kaito/presets/tuning/${MODEL_TYPE}/metrics/metrics_server.py /workspace/tfs/metrics_server.py - -# Copy the entire model weights to the weights directory -COPY ${WEIGHTS_PATH} /workspace/tfs/weights +COPY kaito/presets/inference/vllm/requirements.txt /workspace/vllm/inference-requirements.txt + +RUN pip install --no-cache-dir -r /workspace/tfs/inference-requirements.txt && \ + pip install --no-cache-dir -r /workspace/tfs/tuning-requirements.txt && \ + pip install --no-cache-dir -r /workspace/vllm/inference-requirements.txt + +# 1. Huggingface transformers +COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py \ + kaito/presets/tuning/${MODEL_TYPE}/cli.py \ + kaito/presets/tuning/${MODEL_TYPE}/fine_tuning.py \ + kaito/presets/tuning/${MODEL_TYPE}/parser.py \ + kaito/presets/tuning/${MODEL_TYPE}/dataset.py \ + kaito/presets/tuning/${MODEL_TYPE}/metrics/metrics_server.py \ + /workspace/tfs/ + +# 2. vLLM +COPY kaito/presets/inference/vllm/inference_api.py /workspace/vllm/inference_api.py + +# Chat template +RUN apt update && apt install -y git && \ + rm /var/lib/apt/lists/* -r +RUN git clone https://github.com/chujiezheng/chat_templates /tmp/chat_templates && \ + cd /tmp/chat_templates && \ + git reset --hard 670a2eb && \ + cp -r ./chat_templates/ /workspace/ && \ + rm -rf /tmp/chat_templates + +# Model weights +COPY ${WEIGHTS_PATH} /workspace/weights +RUN echo $VERSION > /workspace/version.txt && \ + ln -s /workspace/weights /workspace/tfs/weights && \ + ln -s /workspace/weights /workspace/vllm/weights diff --git a/hack/run-pytest-in-venv.sh b/hack/run-pytest-in-venv.sh index 3e73b5cfd..e56996d3c 100755 --- a/hack/run-pytest-in-venv.sh +++ b/hack/run-pytest-in-venv.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -ex +set -e if [ "$#" -ne 2 ]; then echo "Usage: $0 " @@ -20,7 +20,7 @@ trap cleanup EXIT cd $VENV_DIR printf "Creating virtual environment in %s\n" "$VENV_DIR" -python3 -m virtualenv venv +python3 -m virtualenv --system-site-packages venv source "$VENV_DIR/venv/bin/activate" if [ "$?" 
-ne 0 ]; then printf "Failed to activate virtual environment\n" diff --git a/pkg/controllers/workspace_controller.go b/pkg/controllers/workspace_controller.go index 77a7d3f5c..c0d9028b6 100644 --- a/pkg/controllers/workspace_controller.go +++ b/pkg/controllers/workspace_controller.go @@ -830,7 +830,7 @@ func (c *WorkspaceReconciler) applyInference(ctx context.Context, wObj *kaitov1a } else if apierrors.IsNotFound(err) { var workloadObj client.Object // Need to create a new workload - workloadObj, err = inference.CreatePresetInference(ctx, wObj, revisionStr, inferenceParam, model.SupportDistributedInference(), c.Client) + workloadObj, err = inference.CreatePresetInference(ctx, wObj, revisionStr, model, c.Client) if err != nil { return } diff --git a/pkg/inference/preset-inferences.go b/pkg/inference/preset-inferences.go index 20e503d38..86c369efe 100644 --- a/pkg/inference/preset-inferences.go +++ b/pkg/inference/preset-inferences.go @@ -11,6 +11,7 @@ import ( "github.com/azure/kaito/pkg/utils" "github.com/azure/kaito/pkg/utils/consts" + "github.com/azure/kaito/api/v1alpha1" kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1" "github.com/azure/kaito/pkg/model" "github.com/azure/kaito/pkg/resources" @@ -22,9 +23,8 @@ import ( ) const ( - ProbePath = "/healthz" - Port5000 = int32(5000) - InferenceFile = "inference_api.py" + ProbePath = "/health" + Port5000 = int32(5000) ) var ( @@ -70,7 +70,12 @@ var ( } ) -func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient client.Client, wObj *kaitov1alpha1.Workspace, inferenceObj *model.PresetParam) error { +func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient client.Client, wObj *kaitov1alpha1.Workspace, inferenceParam *model.PresetParam) error { + backendName := v1alpha1.GetWorkspaceBackendName(wObj) + if backendName != model.BackendNameHuggingfaceTransformers { + return fmt.Errorf("distributed inference is not supported for backend %s", backendName) + } + existingService := &corev1.Service{} err := resources.GetResource(ctx, wObj.Name, wObj.Namespace, kubeClient, existingService) if err != nil { @@ -78,18 +83,18 @@ func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient cl } nodes := *wObj.Resource.Count - inferenceObj.TorchRunParams["nnodes"] = strconv.Itoa(nodes) - inferenceObj.TorchRunParams["nproc_per_node"] = strconv.Itoa(inferenceObj.WorldSize / nodes) + inferenceParam.Transformers.TorchRunParams["nnodes"] = strconv.Itoa(nodes) + inferenceParam.Transformers.TorchRunParams["nproc_per_node"] = strconv.Itoa(inferenceParam.WorldSize / nodes) if nodes > 1 { - inferenceObj.TorchRunParams["node_rank"] = "$(echo $HOSTNAME | grep -o '[^-]*$')" - inferenceObj.TorchRunParams["master_addr"] = existingService.Spec.ClusterIP - inferenceObj.TorchRunParams["master_port"] = "29500" - } - if inferenceObj.TorchRunRdzvParams != nil { - inferenceObj.TorchRunRdzvParams["max_restarts"] = "3" - inferenceObj.TorchRunRdzvParams["rdzv_id"] = "job" - inferenceObj.TorchRunRdzvParams["rdzv_backend"] = "c10d" - inferenceObj.TorchRunRdzvParams["rdzv_endpoint"] = + inferenceParam.Transformers.TorchRunParams["node_rank"] = "$(echo $HOSTNAME | grep -o '[^-]*$')" + inferenceParam.Transformers.TorchRunParams["master_addr"] = existingService.Spec.ClusterIP + inferenceParam.Transformers.TorchRunParams["master_port"] = "29500" + } + if inferenceParam.Transformers.TorchRunRdzvParams != nil { + inferenceParam.Transformers.TorchRunRdzvParams["max_restarts"] = "3" + 
inferenceParam.Transformers.TorchRunRdzvParams["rdzv_id"] = "job" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_backend"] = "c10d" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_endpoint"] = fmt.Sprintf("%s-0.%s-headless.%s.svc.cluster.local:29500", wObj.Name, wObj.Name, wObj.Namespace) } return nil @@ -114,14 +119,17 @@ func GetInferenceImageInfo(ctx context.Context, workspaceObj *kaitov1alpha1.Work } func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, revisionNum string, - inferenceObj *model.PresetParam, supportDistributedInference bool, kubeClient client.Client) (client.Object, error) { - if inferenceObj.TorchRunParams != nil && supportDistributedInference { - if err := updateTorchParamsForDistributedInference(ctx, kubeClient, workspaceObj, inferenceObj); err != nil { + model model.Model, kubeClient client.Client) (client.Object, error) { + inferenceParam := model.GetInferenceParameters().DeepCopy() + + if model.SupportDistributedInference() { + if err := updateTorchParamsForDistributedInference(ctx, kubeClient, workspaceObj, inferenceParam); err != nil { // klog.ErrorS(err, "failed to update torch params", "workspace", workspaceObj) return nil, err } } + // additional volume var volumes []corev1.Volume var volumeMounts []corev1.VolumeMount shmVolume, shmVolumeMount := utils.ConfigSHMVolume(*workspaceObj.Resource.Count) @@ -131,24 +139,35 @@ func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Work if shmVolumeMount.Name != "" { volumeMounts = append(volumeMounts, shmVolumeMount) } - if len(workspaceObj.Inference.Adapters) > 0 { adapterVolume, adapterVolumeMount := utils.ConfigAdapterVolume() volumes = append(volumes, adapterVolume) volumeMounts = append(volumeMounts, adapterVolumeMount) } + // resource requirements skuNumGPUs, err := utils.GetSKUNumGPUs(ctx, kubeClient, workspaceObj.Status.WorkerNodes, - workspaceObj.Resource.InstanceType, inferenceObj.GPUCountRequirement) + workspaceObj.Resource.InstanceType, inferenceParam.GPUCountRequirement) if err != nil { return nil, fmt.Errorf("failed to get SKU num GPUs: %v", err) } + resourceReq := corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), + }, + Limits: corev1.ResourceList{ + corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), + }, + } - commands, resourceReq := prepareInferenceParameters(ctx, inferenceObj, skuNumGPUs) - image, imagePullSecrets := GetInferenceImageInfo(ctx, workspaceObj, inferenceObj) + // inference command + backendName := v1alpha1.GetWorkspaceBackendName(workspaceObj) + commands := inferenceParam.GetInferenceCommand(backendName) + + image, imagePullSecrets := GetInferenceImageInfo(ctx, workspaceObj, inferenceParam) var depObj client.Object - if supportDistributedInference { + if model.SupportDistributedInference() { depObj = resources.GenerateStatefulSetManifest(ctx, workspaceObj, image, imagePullSecrets, *workspaceObj.Resource.Count, commands, containerPorts, livenessProbe, readinessProbe, resourceReq, tolerations, volumes, volumeMounts) } else { @@ -161,25 +180,3 @@ func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Work } return depObj, nil } - -// prepareInferenceParameters builds a PyTorch command: -// torchrun baseCommand -// and sets the GPU resources required for inference. -// Returns the command and resource configuration. 
-func prepareInferenceParameters(ctx context.Context, inferenceObj *model.PresetParam, skuNumGPUs string) ([]string, corev1.ResourceRequirements) { - torchCommand := utils.BuildCmdStr(inferenceObj.BaseCommand, inferenceObj.TorchRunParams) - torchCommand = utils.BuildCmdStr(torchCommand, inferenceObj.TorchRunRdzvParams) - modelCommand := utils.BuildCmdStr(InferenceFile, inferenceObj.ModelRunParams) - commands := utils.ShellCmd(torchCommand + " " + modelCommand) - - resourceRequirements := corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), - }, - Limits: corev1.ResourceList{ - corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), - }, - } - - return commands, resourceRequirements -} diff --git a/pkg/inference/preset-inferences_test.go b/pkg/inference/preset-inferences_test.go index 31b7f4e84..400f66fb8 100644 --- a/pkg/inference/preset-inferences_test.go +++ b/pkg/inference/preset-inferences_test.go @@ -15,7 +15,6 @@ import ( "github.com/azure/kaito/api/v1alpha1" "github.com/azure/kaito/pkg/utils/test" - "github.com/azure/kaito/pkg/model" "github.com/azure/kaito/pkg/utils/plugin" "github.com/stretchr/testify/mock" appsv1 "k8s.io/api/apps/v1" @@ -28,6 +27,7 @@ var ValidStrength string = "0.5" func TestCreatePresetInference(t *testing.T) { test.RegisterTestModel() testcases := map[string]struct { + workspace *v1alpha1.Workspace nodeCount int modelName string callMocks func(c *test.MockClient) @@ -37,7 +37,8 @@ func TestCreatePresetInference(t *testing.T) { expectedVolume string }{ - "test-model": { + "test-model/vllm": { + workspace: test.MockWorkspaceWithPreset, nodeCount: 1, modelName: "test-model", callMocks: func(c *test.MockClient) { @@ -46,32 +47,48 @@ func TestCreatePresetInference(t *testing.T) { workload: "Deployment", // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams // So expected cmd consists of shell command and inference file - expectedCmd: "/bin/sh -c inference_api.py", + expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py", hasAdapters: false, }, - "test-distributed-model": { + "test-model-with-adapters/vllm": { + workspace: test.MockWorkspaceWithPreset, nodeCount: 1, - modelName: "test-distributed-model", + modelName: "test-model", callMocks: func(c *test.MockClient) { - c.On("Get", mock.IsType(context.TODO()), mock.Anything, mock.IsType(&corev1.Service{}), mock.Anything).Return(nil) - c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) + c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) }, - workload: "StatefulSet", - expectedCmd: "/bin/sh -c inference_api.py", - hasAdapters: false, + workload: "Deployment", + expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py", + hasAdapters: true, + expectedVolume: "adapter-volume", }, - "test-model-with-adapters": { + "test-model/transformers": { + workspace: test.MockWorkspaceWithPresetTransformers, nodeCount: 1, modelName: "test-model", callMocks: func(c *test.MockClient) { c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) }, - workload: "Deployment", - expectedCmd: "/bin/sh -c inference_api.py", - hasAdapters: true, - expectedVolume: "adapter-volume", + workload: "Deployment", + // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams + // So expected cmd consists of shell command and 
inference file + expectedCmd: "/bin/sh -c accelerate launch /workspace/tfs/inference_api.py", + hasAdapters: false, + }, + + "test-distributed-model/transformers": { + workspace: test.MockWorkspaceDistributedModelTransformers, + nodeCount: 1, + modelName: "test-distributed-model", + callMocks: func(c *test.MockClient) { + c.On("Get", mock.IsType(context.TODO()), mock.Anything, mock.IsType(&corev1.Service{}), mock.Anything).Return(nil) + c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) + }, + workload: "StatefulSet", + expectedCmd: "/bin/sh -c accelerate launch --nnodes=1 --nproc_per_node=0 --max_restarts=3 --rdzv_id=job --rdzv_backend=c10d --rdzv_endpoint=testWorkspace-0.testWorkspace-headless.kaito.svc.cluster.local:29500 /workspace/tfs/inference_api.p", + hasAdapters: false, }, } @@ -81,7 +98,7 @@ func TestCreatePresetInference(t *testing.T) { mockClient := test.NewClient() tc.callMocks(mockClient) - workspace := test.MockWorkspaceWithPreset + workspace := tc.workspace workspace.Resource.Count = &tc.nodeCount if tc.hasAdapters { @@ -96,15 +113,8 @@ func TestCreatePresetInference(t *testing.T) { } } - useHeadlessSvc := false - - var inferenceObj *model.PresetParam model := plugin.KaitoModelRegister.MustGet(tc.modelName) - inferenceObj = model.GetInferenceParameters() - if strings.Contains(tc.modelName, "distributed") { - useHeadlessSvc = true - } svc := &corev1.Service{ ObjectMeta: v1.ObjectMeta{ Name: workspace.Name, @@ -116,7 +126,7 @@ func TestCreatePresetInference(t *testing.T) { } mockClient.CreateOrUpdateObjectInMap(svc) - createdObject, _ := CreatePresetInference(context.TODO(), workspace, test.MockWorkspaceWithPresetHash, inferenceObj, useHeadlessSvc, mockClient) + createdObject, _ := CreatePresetInference(context.TODO(), workspace, test.MockWorkspaceWithPresetHash, model, mockClient) createdWorkload := "" switch createdObject.(type) { case *appsv1.Deployment: diff --git a/pkg/model/interface.go b/pkg/model/interface.go index 56c925698..6f9996d33 100644 --- a/pkg/model/interface.go +++ b/pkg/model/interface.go @@ -4,6 +4,8 @@ package model import ( "time" + + "github.com/azure/kaito/pkg/utils" ) type Model interface { @@ -13,23 +15,131 @@ type Model interface { SupportTuning() bool } +// BackendName is LLM runtime name. +type BackendName string + +const ( + BackendNameHuggingfaceTransformers BackendName = "transformers" + BackendNameVLLM BackendName = "vllm" + + InferenceFileHuggingface = "/workspace/tfs/inference_api.py" + InferenceFileVLLM = "/workspace/vllm/inference_api.py" +) + // PresetParam defines the preset inference parameters for a model. type PresetParam struct { - ModelFamilyName string // The name of the model family. - ImageAccessMode string // Defines where the Image is Public or Private. - DiskStorageRequirement string // Disk storage requirements for the model. - GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference. - TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference. - PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference. - TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning. - TorchRunParams map[string]string // Parameters for configuring the torchrun command. - TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic). 
- BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. - ModelRunParams map[string]string // Parameters for running the model training/inference. + Tag string // The model image tag + ModelFamilyName string // The name of the model family. + ImageAccessMode string // Defines where the Image is Public or Private. + + DiskStorageRequirement string // Disk storage requirements for the model. + GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference. + TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference. + PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference. + TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning. + WorldSize int // Defines the number of processes required for distributed inference. + + BackendParam + // ReadinessTimeout defines the maximum duration for creating the workload. // This timeout accommodates the size of the image, ensuring pull completion // even under slower network conditions or unforeseen delays. ReadinessTimeout time.Duration - WorldSize int // Defines the number of processes required for distributed inference. - Tag string // The model image tag +} + +// BackendParam defines the llm backend parameters. +type BackendParam struct { + Transformers HuggingfaceTransformersParam + VLLM VLLMParam +} + +type HuggingfaceTransformersParam struct { + BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. + TorchRunParams map[string]string // Parameters for configuring the torchrun command. + TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic). + ModelRunParams map[string]string // Parameters for running the model training/inference. +} + +type VLLMParam struct { + BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. + DistributionParams map[string]string // Parameters for distributed inference. + ModelRunParams map[string]string // Parameters for running the model training/inference. 
+} + +func (p *PresetParam) DeepCopy() *PresetParam { + if p == nil { + return nil + } + out := new(PresetParam) + *out = *p + out.BackendParam = p.BackendParam.DeepCopy() + out.TuningPerGPUMemoryRequirement = make(map[string]int, len(p.TuningPerGPUMemoryRequirement)) + for k, v := range p.TuningPerGPUMemoryRequirement { + out.TuningPerGPUMemoryRequirement[k] = v + } + return out +} + +func (rp *BackendParam) DeepCopy() BackendParam { + if rp == nil { + return BackendParam{} + } + out := BackendParam{} + out.Transformers = rp.Transformers.DeepCopy() + out.VLLM = rp.VLLM.DeepCopy() + return out +} + +func (h *HuggingfaceTransformersParam) DeepCopy() HuggingfaceTransformersParam { + if h == nil { + return HuggingfaceTransformersParam{} + } + out := HuggingfaceTransformersParam{} + out.BaseCommand = h.BaseCommand + out.TorchRunParams = make(map[string]string, len(h.TorchRunParams)) + for k, v := range h.TorchRunParams { + out.TorchRunParams[k] = v + } + out.TorchRunRdzvParams = make(map[string]string, len(h.TorchRunRdzvParams)) + for k, v := range h.TorchRunRdzvParams { + out.TorchRunRdzvParams[k] = v + } + out.ModelRunParams = make(map[string]string, len(h.ModelRunParams)) + for k, v := range h.ModelRunParams { + out.ModelRunParams[k] = v + } + return out +} + +func (v *VLLMParam) DeepCopy() VLLMParam { + if v == nil { + return VLLMParam{} + } + out := VLLMParam{} + out.BaseCommand = v.BaseCommand + out.DistributionParams = make(map[string]string, len(v.DistributionParams)) + for k, v := range v.DistributionParams { + out.DistributionParams[k] = v + } + out.ModelRunParams = make(map[string]string, len(v.ModelRunParams)) + for k, v := range v.ModelRunParams { + out.ModelRunParams[k] = v + } + return out +} + +// builds the container command: +// eg. torchrun baseCommand +func (p *PresetParam) GetInferenceCommand(backend BackendName) []string { + switch backend { + case BackendNameHuggingfaceTransformers: + torchCommand := utils.BuildCmdStr(p.Transformers.BaseCommand, p.Transformers.TorchRunParams, p.Transformers.TorchRunRdzvParams) + modelCommand := utils.BuildCmdStr(InferenceFileHuggingface, p.Transformers.ModelRunParams) + return utils.ShellCmd(torchCommand + " " + modelCommand) + case BackendNameVLLM: + modelCommand := utils.BuildCmdStr(InferenceFileVLLM, p.VLLM.ModelRunParams) + return utils.ShellCmd(p.VLLM.BaseCommand + " " + modelCommand) + default: + return nil + } } diff --git a/pkg/tuning/preset-tuning.go b/pkg/tuning/preset-tuning.go index ad81115a5..5743d2718 100644 --- a/pkg/tuning/preset-tuning.go +++ b/pkg/tuning/preset-tuning.go @@ -490,7 +490,7 @@ func handleURLDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Worksp } func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam) (string, error) { - modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.ModelRunParams) + modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.Transformers.ModelRunParams) return modelCommand, nil } @@ -500,14 +500,14 @@ func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam // Returns the command and resource configuration. 
func prepareTuningParameters(ctx context.Context, wObj *kaitov1alpha1.Workspace, modelCommand string, tuningObj *model.PresetParam, skuNumGPUs string) ([]string, corev1.ResourceRequirements) { - if tuningObj.TorchRunParams == nil { - tuningObj.TorchRunParams = make(map[string]string) + hfParam := tuningObj.Transformers // Only support Huggingface for now + if hfParam.TorchRunParams == nil { + hfParam.TorchRunParams = make(map[string]string) } // Set # of processes to GPU Count numProcesses := getInstanceGPUCount(wObj.Resource.InstanceType) - tuningObj.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) - torchCommand := utils.BuildCmdStr(tuningObj.BaseCommand, tuningObj.TorchRunParams) - torchCommand = utils.BuildCmdStr(torchCommand, tuningObj.TorchRunRdzvParams) + hfParam.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) + torchCommand := utils.BuildCmdStr(hfParam.BaseCommand, hfParam.TorchRunParams, hfParam.TorchRunRdzvParams) commands := utils.ShellCmd(torchCommand + " " + modelCommand) resourceRequirements := corev1.ResourceRequirements{ diff --git a/pkg/tuning/preset-tuning_test.go b/pkg/tuning/preset-tuning_test.go index 99344ddcb..405e86271 100644 --- a/pkg/tuning/preset-tuning_test.go +++ b/pkg/tuning/preset-tuning_test.go @@ -416,9 +416,13 @@ func TestPrepareTuningParameters(t *testing.T) { }, modelCommand: "model-command", tuningObj: &model.PresetParam{ - BaseCommand: "python train.py", - TorchRunParams: map[string]string{}, - TorchRunRdzvParams: map[string]string{}, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "python train.py", + TorchRunParams: map[string]string{}, + TorchRunRdzvParams: map[string]string{}, + }, + }, GPUCountRequirement: "2", }, expectedCommands: []string{"/bin/sh", "-c", "python train.py --num_processes=1 model-command"}, diff --git a/pkg/utils/common-preset.go b/pkg/utils/common-preset.go index df96276d4..c9f5f8dd0 100644 --- a/pkg/utils/common-preset.go +++ b/pkg/utils/common-preset.go @@ -3,7 +3,6 @@ package utils import ( - "github.com/azure/kaito/pkg/utils/plugin" corev1 "k8s.io/api/core/v1" ) @@ -150,7 +149,3 @@ func ConfigAdapterVolume() (corev1.Volume, corev1.VolumeMount) { } return volume, volumeMount } - -func IsValidPreset(preset string) bool { - return plugin.KaitoModelRegister.Has(preset) -} diff --git a/pkg/utils/common.go b/pkg/utils/common.go index 94209bf83..8b53f14f6 100644 --- a/pkg/utils/common.go +++ b/pkg/utils/common.go @@ -66,13 +66,15 @@ func MergeConfigMaps(baseMap, overrideMap map[string]string) map[string]string { return merged } -func BuildCmdStr(baseCommand string, runParams map[string]string) string { +func BuildCmdStr(baseCommand string, runParams ...map[string]string) string { updatedBaseCommand := baseCommand - for key, value := range runParams { - if value == "" { - updatedBaseCommand = fmt.Sprintf("%s --%s", updatedBaseCommand, key) - } else { - updatedBaseCommand = fmt.Sprintf("%s --%s=%s", updatedBaseCommand, key, value) + for _, runParam := range runParams { + for key, value := range runParam { + if value == "" { + updatedBaseCommand = fmt.Sprintf("%s --%s", updatedBaseCommand, key) + } else { + updatedBaseCommand = fmt.Sprintf("%s --%s=%s", updatedBaseCommand, key, value) + } } } diff --git a/pkg/utils/common_test.go b/pkg/utils/common_test.go index e23997692..b214150e7 100644 --- a/pkg/utils/common_test.go +++ b/pkg/utils/common_test.go @@ -2,7 +2,6 @@ package utils import ( "context" - "sigs.k8s.io/controller-runtime/pkg/client" 
"testing" "github.com/stretchr/testify/assert" @@ -12,6 +11,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) diff --git a/pkg/utils/plugin/plugin.go b/pkg/utils/plugin/plugin.go index 6f186a9f9..e8708e7f0 100644 --- a/pkg/utils/plugin/plugin.go +++ b/pkg/utils/plugin/plugin.go @@ -60,3 +60,7 @@ func (reg *ModelRegister) Has(name string) bool { _, ok := reg.models[name] return ok } + +func IsValidPreset(preset string) bool { + return KaitoModelRegister.Has(preset) +} diff --git a/pkg/utils/test/testModel.go b/pkg/utils/test/testModel.go index d12d3f720..83f52b66a 100644 --- a/pkg/utils/test/testModel.go +++ b/pkg/utils/test/testModel.go @@ -15,7 +15,15 @@ type testModel struct{} func (*testModel) GetInferenceParameters() *model.PresetParam { return &model.PresetParam{ GPUCountRequirement: "1", - ReadinessTimeout: time.Duration(30) * time.Minute, + BackendParam: model.BackendParam{ + VLLM: model.VLLMParam{ + BaseCommand: "python3", + }, + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "accelerate launch", + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, } } func (*testModel) GetTuningParameters() *model.PresetParam { @@ -36,7 +44,15 @@ type testDistributedModel struct{} func (*testDistributedModel) GetInferenceParameters() *model.PresetParam { return &model.PresetParam{ GPUCountRequirement: "1", - ReadinessTimeout: time.Duration(30) * time.Minute, + BackendParam: model.BackendParam{ + VLLM: model.VLLMParam{ + BaseCommand: "python3", + }, + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "accelerate launch", + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, } } func (*testDistributedModel) GetTuningParameters() *model.PresetParam { diff --git a/pkg/utils/test/testUtils.go b/pkg/utils/test/testUtils.go index 3e6410efc..ba6c762c6 100644 --- a/pkg/utils/test/testUtils.go +++ b/pkg/utils/test/testUtils.go @@ -6,6 +6,7 @@ package test import ( "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/azure/kaito/api/v1alpha1" + "github.com/azure/kaito/pkg/model" "github.com/samber/lo" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -47,6 +48,31 @@ var ( }, }, } + MockWorkspaceDistributedModelTransformers = &v1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testWorkspace", + Namespace: "kaito", + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceBackend: string(model.BackendNameHuggingfaceTransformers), + }, + }, + Resource: v1alpha1.ResourceSpec{ + Count: &gpuNodeCount, + InstanceType: "Standard_NC12s_v3", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "apps": "test", + }, + }, + }, + Inference: &v1alpha1.InferenceSpec{ + Preset: &v1alpha1.PresetSpec{ + PresetMeta: v1alpha1.PresetMeta{ + Name: "test-distributed-model", + }, + }, + }, + } MockWorkspaceWithPreferredNodes = &v1alpha1.Workspace{ ObjectMeta: metav1.ObjectMeta{ Name: "testWorkspace", @@ -115,6 +141,31 @@ var ( }, }, } + MockWorkspaceWithPresetTransformers = &v1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testWorkspace", + Namespace: "kaito", + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceBackend: string(model.BackendNameHuggingfaceTransformers), + }, + }, + Resource: v1alpha1.ResourceSpec{ + Count: &gpuNodeCount, + InstanceType: "Standard_NC12s_v3", + LabelSelector: &metav1.LabelSelector{ + 
MatchLabels: map[string]string{ + "apps": "test", + }, + }, + }, + Inference: &v1alpha1.InferenceSpec{ + Preset: &v1alpha1.PresetSpec{ + PresetMeta: v1alpha1.PresetMeta{ + Name: "test-model", + }, + }, + }, + } ) var MockWorkspaceWithPresetHash = "89ae127050ec264a5ce84db48ef7226574cdf1299e6bd27fe90b927e34cc8adb" diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index e9aca92e4..eb875cdeb 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -2,9 +2,12 @@ # Licensed under the MIT license. import logging import os -import subprocess +import sys +import signal +import codecs +from pathlib import Path from dataclasses import asdict, dataclass, field -from typing import Annotated, Any, Dict, List, Optional +from typing import Annotated, Any, Dict, List, Optional, Union import GPUtil import psutil @@ -30,7 +33,7 @@ class ModelConfig: """ Transformers Model Configuration Parameters """ - pipeline: str = field(metadata={"help": "The model pipeline for the pre-trained model"}) + pipeline: Optional[str] = field(default="text-generation", metadata={"help": "The model pipeline for the pre-trained model"}) pretrained_model_name_or_path: Optional[str] = field(default="/workspace/tfs/weights", metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"}) combination_type: Optional[str]=field(default="svd", metadata={"help": "The combination type of multi adapters"}) state_dict: Optional[Dict[str, Any]] = field(default=None, metadata={"help": "State dictionary for the model"}) @@ -47,6 +50,7 @@ class ModelConfig: load_in_8bit: bool = field(default=False, metadata={"help": "Load model in 8-bit mode"}) torch_dtype: Optional[str] = field(default=None, metadata={"help": "The torch dtype for the pre-trained model"}) device_map: str = field(default="auto", metadata={"help": "The device map for the pre-trained model"}) + chat_template: Optional[str] = field(default=None, metadata={"help": "The file path to the chat template, or the template in single-line form for the specified model"}) # Method to process additional arguments def process_additional_args(self, addt_args: List[str]): @@ -83,7 +87,22 @@ def __post_init__(self): # validate parameters supported_pipelines = {"conversational", "text-generation"} if self.pipeline not in supported_pipelines: raise ValueError(f"Unsupported pipeline: {self.pipeline}") - + +def load_chat_template(chat_template: Optional[str]) -> Optional[str]: + logger.info(chat_template) + if chat_template is None: + return None + + JINJA_CHARS = "{}\n" + if any(c in chat_template for c in JINJA_CHARS): + resolved_chat_template = codecs.decode(chat_template, "unicode_escape") + else: + resolved_chat_template = Path(chat_template).read_text() + + logger.info("Chat template loaded successfully") + logger.info("Chat template: %s", resolved_chat_template) + return resolved_chat_template + parser = HfArgumentParser(ModelConfig) args, additional_args = parser.parse_args_into_dataclasses( @@ -98,7 +117,10 @@ def __post_init__(self): # validate parameters combination_type = model_args.pop('combination_type') app = FastAPI() +resovled_chat_template = load_chat_template(model_args.pop('chat_template')) tokenizer = AutoTokenizer.from_pretrained(**model_args) +if resovled_chat_template is not None: + tokenizer.chat_template = resovled_chat_template base_model = AutoModelForCausalLM.from_pretrained(**model_args) if not 
os.path.exists(ADAPTERS_DIR): @@ -153,7 +175,7 @@ def __post_init__(self): # validate parameters pipeline_kwargs["torch_dtype"] = args.torch_dtype pipeline = transformers.pipeline( - model_pipeline, + task="text-generation", model=model, tokenizer=tokenizer, **pipeline_kwargs @@ -492,7 +514,11 @@ def get_metrics(): logger.error(f"Error fetching metrics: {e}") raise HTTPException(status_code=500, detail=str(e)) +def shutdown_handler(sig, frame): + sys.exit(0) + if __name__ == "__main__": + signal.signal(signal.SIGINT, shutdown_handler) local_rank = int(os.environ.get("LOCAL_RANK", 0)) # Default to 0 if not set port = 5000 + local_rank # Adjust port based on local rank logger.info(f"Starting server on port {port}") diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 2530fed6a..bc92259a5 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -1,14 +1,14 @@ # Dependencies for TFS # Core Dependencies -transformers==4.41.2 -torch==2.2.0 +transformers >= 4.45.0 +torch==2.4.0 accelerate==0.30.1 fastapi>=0.111.0,<0.112.0 # Allow patch updates -pydantic==2.7.4 +pydantic>=2.9 uvicorn[standard]>=0.29.0,<0.30.0 # Allow patch updates peft==0.11.1 -numpy==1.22.4 +numpy<3.0,>=1.25.0 sentencepiece==0.2.0 # Utility libraries diff --git a/presets/inference/text-generation/tests/test_inference_api.py b/presets/inference/text-generation/tests/test_inference_api.py index baedbb832..e315d372d 100644 --- a/presets/inference/text-generation/tests/test_inference_api.py +++ b/presets/inference/text-generation/tests/test_inference_api.py @@ -13,6 +13,10 @@ # Add the parent directory to sys.path sys.path.append(parent_dir) +CHAT_TEMPLATE = ("{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}" + "{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}" + "{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}") + @pytest.fixture(params=[ {"pipeline": "text-generation", "model_path": "stanford-crfm/alias-gpt2-small-x21", "device": "cpu"}, {"pipeline": "conversational", "model_path": "stanford-crfm/alias-gpt2-small-x21", "device": "cpu"}, @@ -25,7 +29,8 @@ def configured_app(request): '--pipeline', request.param['pipeline'], '--pretrained_model_name_or_path', request.param['model_path'], '--device_map', request.param['device'], - '--allow_remote_files', 'True' + '--allow_remote_files', 'True', + '--chat_template', CHAT_TEMPLATE ] sys.argv = test_args diff --git a/presets/inference/vllm/inference_api.py b/presets/inference/vllm/inference_api.py index ab2613e9e..5b9a2d881 100644 --- a/presets/inference/vllm/inference_api.py +++ b/presets/inference/vllm/inference_api.py @@ -25,7 +25,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: # See https://docs.vllm.ai/en/latest/models/engine_args.html for more args engine_default_args = { - "model": "/workspace/tfs/weights", + "model": "/workspace/vllm/weights", "cpu-offload-gb": 0, "gpu-memory-utilization": 0.9, "swap-space": 4, diff --git a/presets/inference/vllm/requirements.txt b/presets/inference/vllm/requirements.txt index 4481a9966..e54487e83 100644 --- a/presets/inference/vllm/requirements.txt +++ b/presets/inference/vllm/requirements.txt @@ -3,9 +3,10 @@ # Core Dependencies vllm==0.6.3 torch==2.4.0 +transformers >= 4.45.0 uvloop numpy # For UTs pytest -requests \ No newline at end of file 
+requests diff --git a/presets/models/falcon/model.go b/presets/models/falcon/model.go index 74c39995f..dc9b2fb00 100644 --- a/presets/models/falcon/model.go +++ b/presets/models/falcon/model.go @@ -3,9 +3,10 @@ package falcon import ( - "github.com/azure/kaito/pkg/tuning" "time" + "github.com/azure/kaito/pkg/tuning" + kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1" "github.com/azure/kaito/pkg/inference" "github.com/azure/kaito/pkg/model" @@ -47,8 +48,13 @@ var ( baseCommandPresetFalconInference = "accelerate launch" baseCommandPresetFalconTuning = "python3 metrics_server.py & accelerate launch" falconRunParams = map[string]string{ - "torch_dtype": "bfloat16", - "pipeline": "text-generation", + "torch_dtype": "bfloat16", + "pipeline": "text-generation", + "chat_template": "/workspace/chat_templates/falcon-instruct.jinja", + } + falconRunParamsVLLM = map[string]string{ + "dtype": "bfloat16", + "chat-template": "/workspace/chat_templates/falcon-instruct.jinja", } ) @@ -64,11 +70,19 @@ func (*falcon7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon7B"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon7B"], } } func (*falcon7b) GetTuningParameters() *model.PresetParam { @@ -79,10 +93,14 @@ func (*falcon7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunPrams: falconRunTuningParams, // TODO + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconTuning, + TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunPrams: falconRunTuningParams, // TODO + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconTuning, Tag: PresetFalconTagMap["Falcon7B"], TuningPerGPUMemoryRequirement: map[string]int{"qlora": 16}, } @@ -107,11 +125,19 @@ func (*falcon7bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon7BInstruct"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon7BInstruct"], } } @@ -137,13 +163,20 @@ func (*falcon40b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon40B"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon40B"], } - } func (*falcon40b) GetTuningParameters() *model.PresetParam { return &model.PresetParam{ @@ -153,10 +186,14 @@ func (*falcon40b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "16Gi", - TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunPrams: falconRunTuningParams, // TODO + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconTuning, + TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunPrams: falconRunTuningParams, // TODO + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconTuning, Tag: PresetFalconTagMap["Falcon40B"], } } @@ -179,11 +216,19 @@ func (*falcon40bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon40BInstruct"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon40BInstruct"], } } func (*falcon40bInst) GetTuningParameters() *model.PresetParam { diff --git a/presets/models/llama2/model.go b/presets/models/llama2/model.go index 6a62a8987..c8581798c 100644 --- a/presets/models/llama2/model.go +++ b/presets/models/llama2/model.go @@ -32,6 +32,9 @@ var ( "max_seq_len": "512", "max_batch_size": "8", } + llamaRunParamsVLLM = map[string]string{ + "max-seq-len-to-capture": "512", + } ) var llama2A llama2Text7b @@ -46,15 +49,22 @@ func (*llama2Text7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "14Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(10) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 1, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(10) * time.Minute, + WorldSize: 1, // Tag: llama has private image access mode. The image tag is determined by the user. } - } func (*llama2Text7b) GetTuningParameters() *model.PresetParam { return nil // Currently doesn't support fine-tuning @@ -78,12 +88,20 @@ func (*llama2Text13b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "30Gi", PerGPUMemoryRequirement: "15Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(20) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 2, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(20) * time.Minute, + WorldSize: 2, // Tag: llama has private image access mode. The image tag is determined by the user. 
} } @@ -109,12 +127,20 @@ func (*llama2Text70b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "8", TotalGPUMemoryRequirement: "152Gi", PerGPUMemoryRequirement: "19Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 8, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + WorldSize: 8, // Tag: llama has private image access mode. The image tag is determined by the user. } } diff --git a/presets/models/llama2chat/model.go b/presets/models/llama2chat/model.go index 89225bef5..72aa04f40 100644 --- a/presets/models/llama2chat/model.go +++ b/presets/models/llama2chat/model.go @@ -32,6 +32,9 @@ var ( "max_seq_len": "512", "max_batch_size": "8", } + llamaRunParamsVLLM = map[string]string{ + "max-seq-len-to-capture": "512", + } ) var llama2chatA llama2Chat7b @@ -46,12 +49,20 @@ func (*llama2Chat7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "14Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(10) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 1, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(10) * time.Minute, + WorldSize: 1, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -77,12 +88,21 @@ func (*llama2Chat13b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "30Gi", PerGPUMemoryRequirement: "15Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. 
- TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(20) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 2, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(20) * time.Minute, + + WorldSize: 2, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -108,12 +128,20 @@ func (*llama2Chat70b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "8", TotalGPUMemoryRequirement: "192Gi", PerGPUMemoryRequirement: "19Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 8, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + WorldSize: 8, // Tag: llama has private image access mode. The image tag is determined by the user. } } diff --git a/presets/models/mistral/model.go b/presets/models/mistral/model.go index b4581d6f1..a7b62e022 100644 --- a/presets/models/mistral/model.go +++ b/presets/models/mistral/model.go @@ -36,6 +36,11 @@ var ( mistralRunParams = map[string]string{ "torch_dtype": "bfloat16", "pipeline": "text-generation", + "chat_template": "/workspace/chat_templates/mistral-instruct.jinja", + } + mistralRunParamsVLLM = map[string]string{ + "dtype": "bfloat16", + "chat-template": "/workspace/chat_templates/mistral-instruct.jinja", } ) @@ -51,11 +56,19 @@ func (*mistral7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Mistral using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: mistralRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralInference, - Tag: PresetMistralTagMap["Mistral7B"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralInference, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: mistralRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetMistralTagMap["Mistral7B"], } } @@ -67,10 +80,14 @@ func (*mistral7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - //TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunParams: mistralRunParams, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + //TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralTuning, + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralTuning, Tag: PresetMistralTagMap["Mistral7B"], } } @@ -94,11 +111,19 @@ func (*mistral7bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "0Gi", // We run mistral using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: mistralRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralInference, - Tag: PresetMistralTagMap["Mistral7BInstruct"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralInference, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: mistralRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetMistralTagMap["Mistral7BInstruct"], } } diff --git a/presets/models/phi2/model.go b/presets/models/phi2/model.go index 07fb8e0d2..77cc5c46b 100644 --- a/presets/models/phi2/model.go +++ b/presets/models/phi2/model.go @@ -31,6 +31,9 @@ var ( "torch_dtype": "float16", "pipeline": "text-generation", } + phiRunParamsVLLM = map[string]string{ + "dtype": "float16", + } ) var phiA phi2 @@ -45,11 +48,19 @@ func (*phi2) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "12Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
-		TorchRunParams: inference.DefaultAccelerateParams,
-		ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiInference,
-		Tag: PresetPhiTagMap["Phi2"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				TorchRunParams: inference.DefaultAccelerateParams,
+				ModelRunParams: phiRunParams,
+				BaseCommand: baseCommandPresetPhiInference,
+			},
+			VLLM: model.VLLMParam{
+				BaseCommand: "python3",
+				ModelRunParams: phiRunParamsVLLM,
+			},
+		},
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		Tag: PresetPhiTagMap["Phi2"],
 	}
 }
 func (*phi2) GetTuningParameters() *model.PresetParam {
@@ -60,10 +71,14 @@ func (*phi2) GetTuningParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "16Gi",
 		PerGPUMemoryRequirement: "16Gi",
-		// TorchRunParams: inference.DefaultAccelerateParams,
-		// ModelRunParams: phiRunParams,
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				// TorchRunParams: inference.DefaultAccelerateParams,
+				// ModelRunParams: phiRunParams,
+				BaseCommand: baseCommandPresetPhiTuning,
+			},
+		},
 		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiTuning,
 		Tag: PresetPhiTagMap["Phi2"],
 	}
 }
diff --git a/presets/models/phi3/model.go b/presets/models/phi3/model.go
index 5656fc15a..4f2218495 100644
--- a/presets/models/phi3/model.go
+++ b/presets/models/phi3/model.go
@@ -50,6 +50,9 @@ var (
 		"pipeline": "text-generation",
 		"trust_remote_code": "",
 	}
+	phiRunParamsVLLM = map[string]string{
+		"dtype": "auto",
+	}
 )
 var phi3MiniA phi3Mini4KInst
@@ -64,11 +67,19 @@ func (*phi3Mini4KInst) GetInferenceParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "9Gi",
 		PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
-		TorchRunParams: inference.DefaultAccelerateParams,
-		ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiInference,
-		Tag: PresetPhiTagMap["Phi3Mini4kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiInference,
+				TorchRunParams: inference.DefaultAccelerateParams,
+				ModelRunParams: phiRunParams,
+			},
+			VLLM: model.VLLMParam{
+				BaseCommand: "python3",
+				ModelRunParams: phiRunParamsVLLM,
+			},
+		},
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		Tag: PresetPhiTagMap["Phi3Mini4kInstruct"],
 	}
 }
 func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam {
@@ -82,8 +93,12 @@ func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam {
 		// TorchRunParams: inference.DefaultAccelerateParams,
 		// ModelRunParams: phiRunParams,
 		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiTuning,
-		Tag: PresetPhiTagMap["Phi3Mini4kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiTuning,
+			},
+		},
+		Tag: PresetPhiTagMap["Phi3Mini4kInstruct"],
 	}
 }
 func (*phi3Mini4KInst) SupportDistributedInference() bool { return false }
@@ -103,11 +118,19 @@ func (*phi3Mini128KInst) GetInferenceParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "9Gi",
 		PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
-		TorchRunParams: inference.DefaultAccelerateParams,
-		ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiInference,
-		Tag: PresetPhiTagMap["Phi3Mini128kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiInference,
+				TorchRunParams: inference.DefaultAccelerateParams,
+				ModelRunParams: phiRunParams,
+			},
+			VLLM: model.VLLMParam{
+				BaseCommand: "python3",
+				ModelRunParams: phiRunParamsVLLM,
+			},
+		},
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		Tag: PresetPhiTagMap["Phi3Mini128kInstruct"],
 	}
 }
 func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam {
@@ -118,11 +141,17 @@ func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "72Gi",
 		PerGPUMemoryRequirement: "72Gi",
-		// TorchRunParams: inference.DefaultAccelerateParams,
-		// ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiTuning,
-		Tag: PresetPhiTagMap["Phi3Mini128kInstruct"],
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiTuning,
+			},
+			VLLM: model.VLLMParam{
+				BaseCommand: "python3",
+				ModelRunParams: phiRunParamsVLLM,
+			},
+		},
+		Tag: PresetPhiTagMap["Phi3Mini128kInstruct"],
 	}
 }
 func (*phi3Mini128KInst) SupportDistributedInference() bool { return false }
@@ -142,11 +171,19 @@ func (*Phi3Medium4kInstruct) GetInferenceParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "28Gi",
 		PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
-		TorchRunParams: inference.DefaultAccelerateParams,
-		ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiInference,
-		Tag: PresetPhiTagMap["Phi3Medium4kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiInference,
+				TorchRunParams: inference.DefaultAccelerateParams,
+				ModelRunParams: phiRunParams,
+			},
+			VLLM: model.VLLMParam{
+				BaseCommand: "python3",
+				ModelRunParams: phiRunParamsVLLM,
+			},
+		},
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		Tag: PresetPhiTagMap["Phi3Medium4kInstruct"],
 	}
 }
 func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam {
@@ -160,8 +197,12 @@ func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam {
 		// TorchRunParams: inference.DefaultAccelerateParams,
 		// ModelRunParams: phiRunParams,
 		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiTuning,
-		Tag: PresetPhiTagMap["Phi3Medium4kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiTuning,
+			},
+		},
+		Tag: PresetPhiTagMap["Phi3Medium4kInstruct"],
 	}
 }
 func (*Phi3Medium4kInstruct) SupportDistributedInference() bool { return false }
@@ -181,11 +222,15 @@ func (*Phi3Medium128kInstruct) GetInferenceParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "28Gi",
 		PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
-		TorchRunParams: inference.DefaultAccelerateParams,
-		ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiInference,
-		Tag: PresetPhiTagMap["Phi3Medium128kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiInference,
+				TorchRunParams: inference.DefaultAccelerateParams,
+				ModelRunParams: phiRunParams,
+			},
+		},
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		Tag: PresetPhiTagMap["Phi3Medium128kInstruct"],
 	}
 }
 func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam {
@@ -196,11 +241,13 @@ func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "80Gi",
 		PerGPUMemoryRequirement: "80Gi",
-		// TorchRunParams: inference.DefaultAccelerateParams,
-		// ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiTuning,
-		Tag: PresetPhiTagMap["Phi3Medium128kInstruct"],
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiTuning,
+			},
+		},
+		Tag: PresetPhiTagMap["Phi3Medium128kInstruct"],
 	}
 }
 func (*Phi3Medium128kInstruct) SupportDistributedInference() bool { return false }
diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml
index be25c83f5..d82eb5582 100644
--- a/presets/models/supported_models.yaml
+++ b/presets/models/supported_models.yaml
@@ -104,8 +104,9 @@ models:
     type: text-generation
    version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/d269012bea6fbe38ce7752c8940fea010eea3383
    runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Add vllm inference backend
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
diff --git a/presets/tuning/text-generation/requirements.txt b/presets/tuning/text-generation/requirements.txt
index e2aeb3098..3da067c95 100644
--- a/presets/tuning/text-generation/requirements.txt
+++ b/presets/tuning/text-generation/requirements.txt
@@ -1,6 +1,6 @@
 # Core Dependencies
-transformers==4.41.2
-torch==2.2.0
+transformers >= 4.45.0
+torch==2.4.0
 accelerate==0.30.1
 fastapi>=0.111.0,<0.112.0 # Allow patch updates
 pydantic==2.7.4
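
The preset changes above only show the call sites of the new grouping; the `pkg/model` types they reference (`BackendParam`, `HuggingfaceTransformersParam`, `VLLMParam`) are defined elsewhere and are not part of these hunks. The sketch below is a minimal reconstruction of what those definitions presumably look like, inferred purely from the fields the presets populate; the field names and types here are assumptions, not the package's actual code.

```go
// Minimal sketch, assuming definitions that match the composite literals used
// in the presets above. The real pkg/model package may differ.
package model

import "time"

// HuggingfaceTransformersParam carries the settings consumed by the
// transformers-based inference/tuning entrypoints.
type HuggingfaceTransformersParam struct {
	BaseCommand        string            // launcher or tuning command for the transformers path
	TorchRunParams     map[string]string // torchrun/accelerate flags
	TorchRunRdzvParams map[string]string // rendezvous flags for multi-node (llama) runs
	ModelRunParams     map[string]string // flags passed to the model script
}

// VLLMParam carries the settings consumed by the vLLM inference entrypoint.
type VLLMParam struct {
	BaseCommand    string            // "python3" in the presets above
	ModelRunParams map[string]string // e.g. {"dtype": "float16"} or a chat-template path
}

// BackendParam groups the per-backend halves of a preset.
type BackendParam struct {
	Transformers HuggingfaceTransformersParam
	VLLM         VLLMParam
}

// PresetParam sketches only the fields visible in the hunks above; the real
// struct carries additional fields not shown in this diff.
type PresetParam struct {
	GPUCountRequirement       string
	TotalGPUMemoryRequirement string
	PerGPUMemoryRequirement   string
	BackendParam              // embedded, so literals can set BackendParam: model.BackendParam{...}
	ReadinessTimeout          time.Duration
	WorldSize                 int
	Tag                       string
}
```

Structured this way, the backend-neutral requirements (GPU count, memory, readiness timeout, tag) stay flat on the preset, while the controller can select either the Transformers or the vLLM half of `BackendParam` when assembling the pod command for the chosen backend.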