From 933107e6f8ff41dacca90523ba1f2f5a55de7318 Mon Sep 17 00:00:00 2001 From: jerryzhuang Date: Thu, 17 Oct 2024 17:42:13 +1100 Subject: [PATCH 1/3] feat: support vllm in controller - set vllm as the default runtime by a featureflag Signed-off-by: jerryzhuang --- Makefile | 4 +- api/v1alpha1/labels.go | 30 +++ api/v1alpha1/workspace_validation.go | 4 +- pkg/featuregates/featuregates.go | 1 + pkg/model/interface.go | 145 +++++++++++-- pkg/utils/common-preset.go | 10 +- pkg/utils/common.go | 14 +- pkg/utils/common_test.go | 2 +- pkg/utils/consts/consts.go | 5 +- pkg/utils/plugin/plugin.go | 4 + pkg/utils/test/testModel.go | 24 ++- pkg/utils/test/testUtils.go | 26 +++ .../controllers/workspace_controller.go | 2 +- .../inference/preset-inference-types.go | 3 + pkg/workspace/inference/preset-inferences.go | 99 +++++---- .../inference/preset-inferences_test.go | 56 +++-- pkg/workspace/tuning/preset-tuning.go | 14 +- pkg/workspace/tuning/preset-tuning_test.go | 10 +- presets/workspace/models/falcon/model.go | 120 ++++++++--- presets/workspace/models/llama2/model.go | 52 +++-- presets/workspace/models/llama2chat/model.go | 52 +++-- presets/workspace/models/mistral/model.go | 65 ++++-- presets/workspace/models/phi2/model.go | 37 +++- presets/workspace/models/phi3/model.go | 161 ++++++++++----- test/e2e/preset_vllm_test.go | 195 ++++++++++++++++++ test/e2e/utils/utils.go | 15 ++ 26 files changed, 892 insertions(+), 258 deletions(-) create mode 100644 test/e2e/preset_vllm_test.go diff --git a/Makefile b/Makefile index 97a58365b..99ca81f7f 100644 --- a/Makefile +++ b/Makefile @@ -126,8 +126,8 @@ GINKGO_FOCUS ?= GINKGO_SKIP ?= GINKGO_NODES ?= 2 GINKGO_NO_COLOR ?= false -GINKGO_TIMEOUT ?= 180m -GINKGO_ARGS ?= -focus="$(GINKGO_FOCUS)" -skip="$(GINKGO_SKIP)" -nodes=$(GINKGO_NODES) -no-color=$(GINKGO_NO_COLOR) -timeout=$(GINKGO_TIMEOUT) --fail-fast +GINKGO_TIMEOUT ?= 120m +GINKGO_ARGS ?= -focus="$(GINKGO_FOCUS)" -skip="$(GINKGO_SKIP)" -nodes=$(GINKGO_NODES) -no-color=$(GINKGO_NO_COLOR) --output-interceptor-mode=none -timeout=$(GINKGO_TIMEOUT) $(E2E_TEST): (cd test/e2e && go test -c . -o $(E2E_TEST)) diff --git a/api/v1alpha1/labels.go b/api/v1alpha1/labels.go index 7e807c29a..e66c8d427 100644 --- a/api/v1alpha1/labels.go +++ b/api/v1alpha1/labels.go @@ -3,6 +3,12 @@ package v1alpha1 +import ( + "github.com/kaito-project/kaito/pkg/featuregates" + "github.com/kaito-project/kaito/pkg/model" + "github.com/kaito-project/kaito/pkg/utils/consts" +) + const ( // Non-prefixed labels/annotations are reserved for end-use. @@ -30,4 +36,28 @@ const ( // RAGEngineRevisionAnnotation is the Annotations for revision number RAGEngineRevisionAnnotation = "ragengine.kaito.io/revision" + + // AnnotationWorkspaceRuntime is the annotation for runtime selection. + AnnotationWorkspaceRuntime = KAITOPrefix + "runtime" ) + +// GetWorkspaceRuntimeName returns the runtime name of the workspace. 
+func GetWorkspaceRuntimeName(ws *Workspace) model.RuntimeName { + if ws == nil { + panic("workspace is nil") + } + runtime := model.RuntimeNameHuggingfaceTransformers + if featuregates.FeatureGates[consts.FeatureFlagVLLM] { + runtime = model.RuntimeNameVLLM + } + + name := ws.Annotations[AnnotationWorkspaceRuntime] + switch name { + case string(model.RuntimeNameHuggingfaceTransformers): + runtime = model.RuntimeNameHuggingfaceTransformers + case string(model.RuntimeNameVLLM): + runtime = model.RuntimeNameVLLM + } + + return runtime +} diff --git a/api/v1alpha1/workspace_validation.go b/api/v1alpha1/workspace_validation.go index 832108990..792318374 100644 --- a/api/v1alpha1/workspace_validation.go +++ b/api/v1alpha1/workspace_validation.go @@ -166,7 +166,7 @@ func (r *TuningSpec) validateCreate(ctx context.Context, workspaceNamespace stri // Currently require a preset to specified, in future we can consider defining a template if r.Preset == nil { errs = errs.Also(apis.ErrMissingField("Preset")) - } else if presetName := string(r.Preset.Name); !utils.IsValidPreset(presetName) { + } else if presetName := string(r.Preset.Name); !plugin.IsValidPreset(presetName) { errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported tuning preset name %s", presetName), "presetName")) } return errs @@ -404,7 +404,7 @@ func (i *InferenceSpec) validateCreate() (errs *apis.FieldError) { if i.Preset != nil { presetName := string(i.Preset.Name) // Validate preset name - if !utils.IsValidPreset(presetName) { + if !plugin.IsValidPreset(presetName) { errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported inference preset name %s", presetName), "presetName")) } // Validate private preset has private image specified diff --git a/pkg/featuregates/featuregates.go b/pkg/featuregates/featuregates.go index 2161210e1..3fea1b2f5 100644 --- a/pkg/featuregates/featuregates.go +++ b/pkg/featuregates/featuregates.go @@ -15,6 +15,7 @@ var ( // FeatureGates is a map that holds the feature gates and their default values for Kaito. FeatureGates = map[string]bool{ consts.FeatureFlagKarpenter: false, + consts.FeatureFlagVLLM: false, // Add more feature gates here } ) diff --git a/pkg/model/interface.go b/pkg/model/interface.go index 56c925698..5585d6bf8 100644 --- a/pkg/model/interface.go +++ b/pkg/model/interface.go @@ -4,6 +4,8 @@ package model import ( "time" + + "github.com/kaito-project/kaito/pkg/utils" ) type Model interface { @@ -13,23 +15,140 @@ type Model interface { SupportTuning() bool } +// RuntimeName is LLM runtime name. +type RuntimeName string + +const ( + RuntimeNameHuggingfaceTransformers RuntimeName = "transformers" + RuntimeNameVLLM RuntimeName = "vllm" +) + // PresetParam defines the preset inference parameters for a model. type PresetParam struct { - ModelFamilyName string // The name of the model family. - ImageAccessMode string // Defines where the Image is Public or Private. - DiskStorageRequirement string // Disk storage requirements for the model. - GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference. - TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference. - PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference. - TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning. - TorchRunParams map[string]string // Parameters for configuring the torchrun command. 
- TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic). - BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. - ModelRunParams map[string]string // Parameters for running the model training/inference. + Tag string // The model image tag + ModelFamilyName string // The name of the model family. + ImageAccessMode string // Defines where the Image is Public or Private. + + DiskStorageRequirement string // Disk storage requirements for the model. + GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference. + TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference. + PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference. + TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning. + WorldSize int // Defines the number of processes required for distributed inference. + + RuntimeParam + // ReadinessTimeout defines the maximum duration for creating the workload. // This timeout accommodates the size of the image, ensuring pull completion // even under slower network conditions or unforeseen delays. ReadinessTimeout time.Duration - WorldSize int // Defines the number of processes required for distributed inference. - Tag string // The model image tag +} + +// RuntimeParam defines the llm runtime parameters. +type RuntimeParam struct { + Transformers HuggingfaceTransformersParam + VLLM VLLMParam +} + +type HuggingfaceTransformersParam struct { + BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. + TorchRunParams map[string]string // Parameters for configuring the torchrun command. + TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic). + InferenceMainFile string // The main file for inference. + ModelRunParams map[string]string // Parameters for running the model training/inference. +} + +type VLLMParam struct { + BaseCommand string + // The model name used in the openai serving API. + // see https://platform.openai.com/docs/api-reference/chat/create#chat-create-model. + ModelName string + // Parameters for distributed inference. + DistributionParams map[string]string + // Parameters for running the model training/inference. 
+ ModelRunParams map[string]string +} + +func (p *PresetParam) DeepCopy() *PresetParam { + if p == nil { + return nil + } + out := new(PresetParam) + *out = *p + out.RuntimeParam = p.RuntimeParam.DeepCopy() + out.TuningPerGPUMemoryRequirement = make(map[string]int, len(p.TuningPerGPUMemoryRequirement)) + for k, v := range p.TuningPerGPUMemoryRequirement { + out.TuningPerGPUMemoryRequirement[k] = v + } + return out +} + +func (rp *RuntimeParam) DeepCopy() RuntimeParam { + if rp == nil { + return RuntimeParam{} + } + out := RuntimeParam{} + out.Transformers = rp.Transformers.DeepCopy() + out.VLLM = rp.VLLM.DeepCopy() + return out +} + +func (h *HuggingfaceTransformersParam) DeepCopy() HuggingfaceTransformersParam { + if h == nil { + return HuggingfaceTransformersParam{} + } + out := HuggingfaceTransformersParam{} + out.BaseCommand = h.BaseCommand + out.InferenceMainFile = h.InferenceMainFile + out.TorchRunParams = make(map[string]string, len(h.TorchRunParams)) + for k, v := range h.TorchRunParams { + out.TorchRunParams[k] = v + } + out.TorchRunRdzvParams = make(map[string]string, len(h.TorchRunRdzvParams)) + for k, v := range h.TorchRunRdzvParams { + out.TorchRunRdzvParams[k] = v + } + out.ModelRunParams = make(map[string]string, len(h.ModelRunParams)) + for k, v := range h.ModelRunParams { + out.ModelRunParams[k] = v + } + return out +} + +func (v *VLLMParam) DeepCopy() VLLMParam { + if v == nil { + return VLLMParam{} + } + out := VLLMParam{} + out.BaseCommand = v.BaseCommand + out.ModelName = v.ModelName + out.DistributionParams = make(map[string]string, len(v.DistributionParams)) + for k, v := range v.DistributionParams { + out.DistributionParams[k] = v + } + out.ModelRunParams = make(map[string]string, len(v.ModelRunParams)) + for k, v := range v.ModelRunParams { + out.ModelRunParams[k] = v + } + return out +} + +// builds the container command: +// eg. 
torchrun baseCommand +func (p *PresetParam) GetInferenceCommand(runtime RuntimeName, skuNumGPUs string) []string { + switch runtime { + case RuntimeNameHuggingfaceTransformers: + torchCommand := utils.BuildCmdStr(p.Transformers.BaseCommand, p.Transformers.TorchRunParams, p.Transformers.TorchRunRdzvParams) + modelCommand := utils.BuildCmdStr(p.Transformers.InferenceMainFile, p.Transformers.ModelRunParams) + return utils.ShellCmd(torchCommand + " " + modelCommand) + case RuntimeNameVLLM: + if p.VLLM.ModelName != "" { + p.VLLM.ModelRunParams["served-model-name"] = p.VLLM.ModelName + } + p.VLLM.ModelRunParams["tensor-parallel-size"] = skuNumGPUs + modelCommand := utils.BuildCmdStr(p.VLLM.BaseCommand, p.VLLM.ModelRunParams) + return utils.ShellCmd(modelCommand) + default: + return nil + } } diff --git a/pkg/utils/common-preset.go b/pkg/utils/common-preset.go index 3a43f6d0d..5af7936a8 100644 --- a/pkg/utils/common-preset.go +++ b/pkg/utils/common-preset.go @@ -3,8 +3,8 @@ package utils import ( - "github.com/kaito-project/kaito/pkg/utils/plugin" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" ) const ( @@ -66,12 +66,14 @@ func ConfigSHMVolume(instanceCount int) (corev1.Volume, corev1.VolumeMount) { // Signifies multinode inference requirement if instanceCount > 1 { + size := resource.MustParse("4Gi") // Append share memory volume to any existing volumes volume = corev1.Volume{ Name: "dshm", VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{ - Medium: "Memory", + Medium: "Memory", + SizeLimit: &size, }, }, } @@ -150,7 +152,3 @@ func ConfigAdapterVolume() (corev1.Volume, corev1.VolumeMount) { } return volume, volumeMount } - -func IsValidPreset(preset string) bool { - return plugin.KaitoModelRegister.Has(preset) -} diff --git a/pkg/utils/common.go b/pkg/utils/common.go index af0c7e4cf..cda6e7d9a 100644 --- a/pkg/utils/common.go +++ b/pkg/utils/common.go @@ -68,13 +68,15 @@ func MergeConfigMaps(baseMap, overrideMap map[string]string) map[string]string { return merged } -func BuildCmdStr(baseCommand string, runParams map[string]string) string { +func BuildCmdStr(baseCommand string, runParams ...map[string]string) string { updatedBaseCommand := baseCommand - for key, value := range runParams { - if value == "" { - updatedBaseCommand = fmt.Sprintf("%s --%s", updatedBaseCommand, key) - } else { - updatedBaseCommand = fmt.Sprintf("%s --%s=%s", updatedBaseCommand, key, value) + for _, runParam := range runParams { + for key, value := range runParam { + if value == "" { + updatedBaseCommand = fmt.Sprintf("%s --%s", updatedBaseCommand, key) + } else { + updatedBaseCommand = fmt.Sprintf("%s --%s=%s", updatedBaseCommand, key, value) + } } } diff --git a/pkg/utils/common_test.go b/pkg/utils/common_test.go index e23997692..b214150e7 100644 --- a/pkg/utils/common_test.go +++ b/pkg/utils/common_test.go @@ -2,7 +2,6 @@ package utils import ( "context" - "sigs.k8s.io/controller-runtime/pkg/client" "testing" "github.com/stretchr/testify/assert" @@ -12,6 +11,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) diff --git a/pkg/utils/consts/consts.go b/pkg/utils/consts/consts.go index 6756302d5..bcb1b8c58 100644 --- a/pkg/utils/consts/consts.go +++ b/pkg/utils/consts/consts.go @@ -11,7 +11,6 @@ const ( // RAGEngineFinalizer is used to make sure that ragengine controller handles garbage collection. 
RAGEngineFinalizer = "ragengine.finalizer.kaito.sh" DefaultReleaseNamespaceEnvVar = "RELEASE_NAMESPACE" - FeatureFlagKarpenter = "Karpenter" AzureCloudName = "azure" AWSCloudName = "aws" GPUString = "gpu" @@ -20,6 +19,10 @@ const ( GiBToBytes = 1024 * 1024 * 1024 // Conversion factor from GiB to bytes NvidiaGPU = "nvidia.com/gpu" + // Feature flags + FeatureFlagKarpenter = "Karpenter" + FeatureFlagVLLM = "vLLM" + // Nodeclaim related consts KaitoNodePoolName = "kaito" LabelNodePool = "karpenter.sh/nodepool" diff --git a/pkg/utils/plugin/plugin.go b/pkg/utils/plugin/plugin.go index 35706cb9c..62f048265 100644 --- a/pkg/utils/plugin/plugin.go +++ b/pkg/utils/plugin/plugin.go @@ -60,3 +60,7 @@ func (reg *ModelRegister) Has(name string) bool { _, ok := reg.models[name] return ok } + +func IsValidPreset(preset string) bool { + return KaitoModelRegister.Has(preset) +} diff --git a/pkg/utils/test/testModel.go b/pkg/utils/test/testModel.go index 2a820191f..bd1f029d8 100644 --- a/pkg/utils/test/testModel.go +++ b/pkg/utils/test/testModel.go @@ -15,8 +15,16 @@ type testModel struct{} func (*testModel) GetInferenceParameters() *model.PresetParam { return &model.PresetParam{ GPUCountRequirement: "1", - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: "python3", + RuntimeParam: model.RuntimeParam{ + VLLM: model.VLLMParam{ + BaseCommand: "python3 /workspace/vllm/inference_api.py", + }, + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "accelerate launch", + InferenceMainFile: "/workspace/tfs/inference_api.py", + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, } } func (*testModel) GetTuningParameters() *model.PresetParam { @@ -37,8 +45,16 @@ type testDistributedModel struct{} func (*testDistributedModel) GetInferenceParameters() *model.PresetParam { return &model.PresetParam{ GPUCountRequirement: "1", - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: "python3", + RuntimeParam: model.RuntimeParam{ + VLLM: model.VLLMParam{ + BaseCommand: "python3 /workspace/vllm/inference_api.py", + }, + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "accelerate launch", + InferenceMainFile: "/workspace/tfs/inference_api.py", + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, } } func (*testDistributedModel) GetTuningParameters() *model.PresetParam { diff --git a/pkg/utils/test/testUtils.go b/pkg/utils/test/testUtils.go index 912cd2bc0..b0f728f6f 100644 --- a/pkg/utils/test/testUtils.go +++ b/pkg/utils/test/testUtils.go @@ -6,6 +6,7 @@ package test import ( "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/kaito-project/kaito/pkg/model" "github.com/samber/lo" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -139,6 +140,31 @@ var ( }, }, } + MockWorkspaceWithPresetVLLM = &v1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testWorkspace", + Namespace: "kaito", + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameVLLM), + }, + }, + Resource: v1alpha1.ResourceSpec{ + Count: &gpuNodeCount, + InstanceType: "Standard_NC12s_v3", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "apps": "test", + }, + }, + }, + Inference: &v1alpha1.InferenceSpec{ + Preset: &v1alpha1.PresetSpec{ + PresetMeta: v1alpha1.PresetMeta{ + Name: "test-model", + }, + }, + }, + } ) var MockWorkspaceWithPresetHash = "89ae127050ec264a5ce84db48ef7226574cdf1299e6bd27fe90b927e34cc8adb" diff --git 
a/pkg/workspace/controllers/workspace_controller.go b/pkg/workspace/controllers/workspace_controller.go index 63173d5b2..80686be86 100644 --- a/pkg/workspace/controllers/workspace_controller.go +++ b/pkg/workspace/controllers/workspace_controller.go @@ -783,7 +783,7 @@ func (c *WorkspaceReconciler) applyInference(ctx context.Context, wObj *kaitov1a } else if apierrors.IsNotFound(err) { var workloadObj client.Object // Need to create a new workload - workloadObj, err = inference.CreatePresetInference(ctx, wObj, revisionStr, inferenceParam, model.SupportDistributedInference(), c.Client) + workloadObj, err = inference.CreatePresetInference(ctx, wObj, revisionStr, model, c.Client) if err != nil { return } diff --git a/pkg/workspace/inference/preset-inference-types.go b/pkg/workspace/inference/preset-inference-types.go index d3157262e..205d0b4b3 100644 --- a/pkg/workspace/inference/preset-inference-types.go +++ b/pkg/workspace/inference/preset-inference-types.go @@ -54,5 +54,8 @@ var ( "gpu_ids": DefaultGPUIds, } + DefaultVLLMCommand = "python3 /workspace/vllm/inference_api.py" + DefautTransformersMainFile = "/workspace/tfs/inference_api.py" + DefaultImagePullSecrets = []corev1.LocalObjectReference{} ) diff --git a/pkg/workspace/inference/preset-inferences.go b/pkg/workspace/inference/preset-inferences.go index 8961f5860..9c4699f3d 100644 --- a/pkg/workspace/inference/preset-inferences.go +++ b/pkg/workspace/inference/preset-inferences.go @@ -11,6 +11,7 @@ import ( "github.com/kaito-project/kaito/pkg/utils" "github.com/kaito-project/kaito/pkg/utils/consts" + "github.com/kaito-project/kaito/api/v1alpha1" kaitov1alpha1 "github.com/kaito-project/kaito/api/v1alpha1" "github.com/kaito-project/kaito/pkg/model" "github.com/kaito-project/kaito/pkg/utils/resources" @@ -23,9 +24,8 @@ import ( ) const ( - ProbePath = "/health" - Port5000 = 5000 - InferenceFile = "inference_api.py" + ProbePath = "/health" + Port5000 = 5000 ) var ( @@ -71,7 +71,12 @@ var ( } ) -func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient client.Client, wObj *kaitov1alpha1.Workspace, inferenceObj *model.PresetParam) error { +func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient client.Client, wObj *kaitov1alpha1.Workspace, inferenceParam *model.PresetParam) error { + runtimeName := v1alpha1.GetWorkspaceRuntimeName(wObj) + if runtimeName != model.RuntimeNameHuggingfaceTransformers { + return fmt.Errorf("distributed inference is not supported for runtime %s", runtimeName) + } + existingService := &corev1.Service{} err := resources.GetResource(ctx, wObj.Name, wObj.Namespace, kubeClient, existingService) if err != nil { @@ -79,18 +84,18 @@ func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient cl } nodes := *wObj.Resource.Count - inferenceObj.TorchRunParams["nnodes"] = strconv.Itoa(nodes) - inferenceObj.TorchRunParams["nproc_per_node"] = strconv.Itoa(inferenceObj.WorldSize / nodes) + inferenceParam.Transformers.TorchRunParams["nnodes"] = strconv.Itoa(nodes) + inferenceParam.Transformers.TorchRunParams["nproc_per_node"] = strconv.Itoa(inferenceParam.WorldSize / nodes) if nodes > 1 { - inferenceObj.TorchRunParams["node_rank"] = "$(echo $HOSTNAME | grep -o '[^-]*$')" - inferenceObj.TorchRunParams["master_addr"] = existingService.Spec.ClusterIP - inferenceObj.TorchRunParams["master_port"] = "29500" - } - if inferenceObj.TorchRunRdzvParams != nil { - inferenceObj.TorchRunRdzvParams["max_restarts"] = "3" - inferenceObj.TorchRunRdzvParams["rdzv_id"] = "job" - 
inferenceObj.TorchRunRdzvParams["rdzv_backend"] = "c10d" - inferenceObj.TorchRunRdzvParams["rdzv_endpoint"] = + inferenceParam.Transformers.TorchRunParams["node_rank"] = "$(echo $HOSTNAME | grep -o '[^-]*$')" + inferenceParam.Transformers.TorchRunParams["master_addr"] = existingService.Spec.ClusterIP + inferenceParam.Transformers.TorchRunParams["master_port"] = "29500" + } + if inferenceParam.Transformers.TorchRunRdzvParams != nil { + inferenceParam.Transformers.TorchRunRdzvParams["max_restarts"] = "3" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_id"] = "job" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_backend"] = "c10d" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_endpoint"] = fmt.Sprintf("%s-0.%s-headless.%s.svc.cluster.local:29500", wObj.Name, wObj.Name, wObj.Namespace) } return nil @@ -123,41 +128,57 @@ func GetInferenceImageInfo(ctx context.Context, workspaceObj *kaitov1alpha1.Work } func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, revisionNum string, - inferenceObj *model.PresetParam, supportDistributedInference bool, kubeClient client.Client) (client.Object, error) { - if inferenceObj.TorchRunParams != nil && supportDistributedInference { - if err := updateTorchParamsForDistributedInference(ctx, kubeClient, workspaceObj, inferenceObj); err != nil { + model model.Model, kubeClient client.Client) (client.Object, error) { + inferenceParam := model.GetInferenceParameters().DeepCopy() + + if model.SupportDistributedInference() { + if err := updateTorchParamsForDistributedInference(ctx, kubeClient, workspaceObj, inferenceParam); err != nil { // klog.ErrorS(err, "failed to update torch params", "workspace", workspaceObj) return nil, err } } + // resource requirements + skuNumGPUs, err := utils.GetSKUNumGPUs(ctx, kubeClient, workspaceObj.Status.WorkerNodes, + workspaceObj.Resource.InstanceType, inferenceParam.GPUCountRequirement) + if err != nil { + return nil, fmt.Errorf("failed to get SKU num GPUs: %v", err) + } + resourceReq := corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), + }, + Limits: corev1.ResourceList{ + corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), + }, + } + skuGPUCount, _ := strconv.Atoi(skuNumGPUs) + + // additional volume var volumes []corev1.Volume var volumeMounts []corev1.VolumeMount - shmVolume, shmVolumeMount := utils.ConfigSHMVolume(*workspaceObj.Resource.Count) + // add share memory for cross process communication + shmVolume, shmVolumeMount := utils.ConfigSHMVolume(skuGPUCount) if shmVolume.Name != "" { volumes = append(volumes, shmVolume) } if shmVolumeMount.Name != "" { volumeMounts = append(volumeMounts, shmVolumeMount) } - if len(workspaceObj.Inference.Adapters) > 0 { adapterVolume, adapterVolumeMount := utils.ConfigAdapterVolume() volumes = append(volumes, adapterVolume) volumeMounts = append(volumeMounts, adapterVolumeMount) } - skuNumGPUs, err := utils.GetSKUNumGPUs(ctx, kubeClient, workspaceObj.Status.WorkerNodes, - workspaceObj.Resource.InstanceType, inferenceObj.GPUCountRequirement) - if err != nil { - return nil, fmt.Errorf("failed to get SKU num GPUs: %v", err) - } + // inference command + runtimeName := kaitov1alpha1.GetWorkspaceRuntimeName(workspaceObj) + commands := inferenceParam.GetInferenceCommand(runtimeName, skuNumGPUs) - commands, resourceReq := prepareInferenceParameters(ctx, inferenceObj, skuNumGPUs) - image, imagePullSecrets := 
GetInferenceImageInfo(ctx, workspaceObj, inferenceObj) + image, imagePullSecrets := GetInferenceImageInfo(ctx, workspaceObj, inferenceParam) var depObj client.Object - if supportDistributedInference { + if model.SupportDistributedInference() { depObj = manifests.GenerateStatefulSetManifest(ctx, workspaceObj, image, imagePullSecrets, *workspaceObj.Resource.Count, commands, containerPorts, livenessProbe, readinessProbe, resourceReq, tolerations, volumes, volumeMounts) } else { @@ -170,25 +191,3 @@ func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Work } return depObj, nil } - -// prepareInferenceParameters builds a PyTorch command: -// torchrun baseCommand -// and sets the GPU resources required for inference. -// Returns the command and resource configuration. -func prepareInferenceParameters(ctx context.Context, inferenceObj *model.PresetParam, skuNumGPUs string) ([]string, corev1.ResourceRequirements) { - torchCommand := utils.BuildCmdStr(inferenceObj.BaseCommand, inferenceObj.TorchRunParams) - torchCommand = utils.BuildCmdStr(torchCommand, inferenceObj.TorchRunRdzvParams) - modelCommand := utils.BuildCmdStr(InferenceFile, inferenceObj.ModelRunParams) - commands := utils.ShellCmd(torchCommand + " " + modelCommand) - - resourceRequirements := corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), - }, - Limits: corev1.ResourceList{ - corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), - }, - } - - return commands, resourceRequirements -} diff --git a/pkg/workspace/inference/preset-inferences_test.go b/pkg/workspace/inference/preset-inferences_test.go index 5a6651b2f..abd3f5385 100644 --- a/pkg/workspace/inference/preset-inferences_test.go +++ b/pkg/workspace/inference/preset-inferences_test.go @@ -10,12 +10,10 @@ import ( "strings" "testing" - "github.com/kaito-project/kaito/pkg/utils/consts" - "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/kaito-project/kaito/pkg/utils/consts" "github.com/kaito-project/kaito/pkg/utils/test" - "github.com/kaito-project/kaito/pkg/model" "github.com/kaito-project/kaito/pkg/utils/plugin" "github.com/stretchr/testify/mock" appsv1 "k8s.io/api/apps/v1" @@ -28,6 +26,7 @@ var ValidStrength string = "0.5" func TestCreatePresetInference(t *testing.T) { test.RegisterTestModel() testcases := map[string]struct { + workspace *v1alpha1.Workspace nodeCount int modelName string callMocks func(c *test.MockClient) @@ -37,7 +36,35 @@ func TestCreatePresetInference(t *testing.T) { expectedVolume string }{ - "test-model": { + "test-model/vllm": { + workspace: test.MockWorkspaceWithPresetVLLM, + nodeCount: 1, + modelName: "test-model", + callMocks: func(c *test.MockClient) { + c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) + }, + workload: "Deployment", + // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams + // So expected cmd consists of shell command and inference file + expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2", + hasAdapters: false, + }, + + "test-model-with-adapters/vllm": { + workspace: test.MockWorkspaceWithPresetVLLM, + nodeCount: 1, + modelName: "test-model", + callMocks: func(c *test.MockClient) { + c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) + }, + workload: "Deployment", + expectedCmd: "/bin/sh -c python3 
/workspace/vllm/inference_api.py --tensor-parallel-size=2", + hasAdapters: true, + expectedVolume: "adapter-volume", + }, + + "test-model/transformers": { + workspace: test.MockWorkspaceWithPreset, nodeCount: 1, modelName: "test-model", callMocks: func(c *test.MockClient) { @@ -46,11 +73,12 @@ func TestCreatePresetInference(t *testing.T) { workload: "Deployment", // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams // So expected cmd consists of shell command and inference file - expectedCmd: "/bin/sh -c python3 inference_api.py", + expectedCmd: "/bin/sh -c accelerate launch /workspace/tfs/inference_api.py", hasAdapters: false, }, - "test-distributed-model": { + "test-distributed-model/transformers": { + workspace: test.MockWorkspaceDistributedModel, nodeCount: 1, modelName: "test-distributed-model", callMocks: func(c *test.MockClient) { @@ -58,18 +86,19 @@ func TestCreatePresetInference(t *testing.T) { c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) }, workload: "StatefulSet", - expectedCmd: "/bin/sh -c python3 inference_api.py", + expectedCmd: "/bin/sh -c accelerate launch --nnodes=1 --nproc_per_node=0 --max_restarts=3 --rdzv_id=job --rdzv_backend=c10d --rdzv_endpoint=testWorkspace-0.testWorkspace-headless.kaito.svc.cluster.local:29500 /workspace/tfs/inference_api.py", hasAdapters: false, }, "test-model-with-adapters": { + workspace: test.MockWorkspaceWithPreset, nodeCount: 1, modelName: "test-model", callMocks: func(c *test.MockClient) { c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) }, workload: "Deployment", - expectedCmd: "/bin/sh -c python3 inference_api.py", + expectedCmd: "/bin/sh -c accelerate launch /workspace/tfs/inference_api.py", hasAdapters: true, expectedVolume: "adapter-volume", }, @@ -81,7 +110,7 @@ func TestCreatePresetInference(t *testing.T) { mockClient := test.NewClient() tc.callMocks(mockClient) - workspace := test.MockWorkspaceWithPreset + workspace := tc.workspace workspace.Resource.Count = &tc.nodeCount expectedSecrets := []string{"fake-secret"} if tc.hasAdapters { @@ -97,15 +126,8 @@ func TestCreatePresetInference(t *testing.T) { } } - useHeadlessSvc := false - - var inferenceObj *model.PresetParam model := plugin.KaitoModelRegister.MustGet(tc.modelName) - inferenceObj = model.GetInferenceParameters() - if strings.Contains(tc.modelName, "distributed") { - useHeadlessSvc = true - } svc := &corev1.Service{ ObjectMeta: v1.ObjectMeta{ Name: workspace.Name, @@ -117,7 +139,7 @@ func TestCreatePresetInference(t *testing.T) { } mockClient.CreateOrUpdateObjectInMap(svc) - createdObject, _ := CreatePresetInference(context.TODO(), workspace, "1", inferenceObj, useHeadlessSvc, mockClient) + createdObject, _ := CreatePresetInference(context.TODO(), workspace, test.MockWorkspaceWithPresetHash, model, mockClient) createdWorkload := "" switch createdObject.(type) { case *appsv1.Deployment: diff --git a/pkg/workspace/tuning/preset-tuning.go b/pkg/workspace/tuning/preset-tuning.go index 5703b1845..94463956c 100644 --- a/pkg/workspace/tuning/preset-tuning.go +++ b/pkg/workspace/tuning/preset-tuning.go @@ -27,7 +27,7 @@ import ( const ( Port5000 = int32(5000) - TuningFile = "fine_tuning.py" + TuningFile = "/workspace/tfs/fine_tuning.py" DefaultBaseDir = "/mnt" DefaultOutputVolumePath = "/mnt/output" ) @@ -491,7 +491,7 @@ func handleURLDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Worksp } func prepareModelRunParameters(ctx 
context.Context, tuningObj *model.PresetParam) (string, error) { - modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.ModelRunParams) + modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.Transformers.ModelRunParams) return modelCommand, nil } @@ -501,14 +501,14 @@ func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam // Returns the command and resource configuration. func prepareTuningParameters(ctx context.Context, wObj *kaitov1alpha1.Workspace, modelCommand string, tuningObj *model.PresetParam, skuNumGPUs string) ([]string, corev1.ResourceRequirements) { - if tuningObj.TorchRunParams == nil { - tuningObj.TorchRunParams = make(map[string]string) + hfParam := tuningObj.Transformers // Only support Huggingface for now + if hfParam.TorchRunParams == nil { + hfParam.TorchRunParams = make(map[string]string) } // Set # of processes to GPU Count numProcesses := getInstanceGPUCount(wObj.Resource.InstanceType) - tuningObj.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) - torchCommand := utils.BuildCmdStr(tuningObj.BaseCommand, tuningObj.TorchRunParams) - torchCommand = utils.BuildCmdStr(torchCommand, tuningObj.TorchRunRdzvParams) + hfParam.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) + torchCommand := utils.BuildCmdStr(hfParam.BaseCommand, hfParam.TorchRunParams, hfParam.TorchRunRdzvParams) commands := utils.ShellCmd(torchCommand + " " + modelCommand) resourceRequirements := corev1.ResourceRequirements{ diff --git a/pkg/workspace/tuning/preset-tuning_test.go b/pkg/workspace/tuning/preset-tuning_test.go index 6e1ede930..1522595e6 100644 --- a/pkg/workspace/tuning/preset-tuning_test.go +++ b/pkg/workspace/tuning/preset-tuning_test.go @@ -416,9 +416,13 @@ func TestPrepareTuningParameters(t *testing.T) { }, modelCommand: "model-command", tuningObj: &model.PresetParam{ - BaseCommand: "python train.py", - TorchRunParams: map[string]string{}, - TorchRunRdzvParams: map[string]string{}, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "python train.py", + TorchRunParams: map[string]string{}, + TorchRunRdzvParams: map[string]string{}, + }, + }, GPUCountRequirement: "2", }, expectedCommands: []string{"/bin/sh", "-c", "python train.py --num_processes=1 model-command"}, diff --git a/presets/workspace/models/falcon/model.go b/presets/workspace/models/falcon/model.go index 88487fd63..63f8452ec 100644 --- a/presets/workspace/models/falcon/model.go +++ b/presets/workspace/models/falcon/model.go @@ -38,17 +38,22 @@ var ( PresetFalcon40BInstructModel = PresetFalcon40BModel + "-instruct" PresetFalconTagMap = map[string]string{ - "Falcon7B": "0.0.6", - "Falcon7BInstruct": "0.0.6", - "Falcon40B": "0.0.7", - "Falcon40BInstruct": "0.0.7", + "Falcon7B": "0.0.7", + "Falcon7BInstruct": "0.0.7", + "Falcon40B": "0.0.8", + "Falcon40BInstruct": "0.0.8", } baseCommandPresetFalconInference = "accelerate launch" - baseCommandPresetFalconTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetFalconTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" falconRunParams = map[string]string{ - "torch_dtype": "bfloat16", - "pipeline": "text-generation", + "torch_dtype": "bfloat16", + "pipeline": "text-generation", + "chat_template": "/workspace/chat_templates/falcon-instruct.jinja", + } + falconRunParamsVLLM = map[string]string{ + "dtype": "float16", + "chat-template": "/workspace/chat_templates/falcon-instruct.jinja", } ) @@ -64,11 +69,21 @@ func (*falcon7b) 
GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon7B"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "falcon-7b", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon7B"], } } func (*falcon7b) GetTuningParameters() *model.PresetParam { @@ -79,10 +94,14 @@ func (*falcon7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunPrams: falconRunTuningParams, // TODO + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconTuning, + TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunPrams: falconRunTuningParams, // TODO + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconTuning, Tag: PresetFalconTagMap["Falcon7B"], TuningPerGPUMemoryRequirement: map[string]int{"qlora": 16}, } @@ -107,11 +126,21 @@ func (*falcon7bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon7BInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "falcon-7b-instruct", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon7BInstruct"], } } @@ -137,13 +166,22 @@ func (*falcon40b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon40B"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "falcon-40b", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon40B"], } - } func (*falcon40b) GetTuningParameters() *model.PresetParam { return &model.PresetParam{ @@ -153,10 +191,14 @@ func (*falcon40b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "16Gi", - TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunPrams: falconRunTuningParams, // TODO + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconTuning, + TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunPrams: falconRunTuningParams, // TODO + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconTuning, Tag: PresetFalconTagMap["Falcon40B"], } } @@ -179,11 +221,21 @@ func (*falcon40bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon40BInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "falcon-40b-instruct", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon40BInstruct"], } } func (*falcon40bInst) GetTuningParameters() *model.PresetParam { diff --git a/presets/workspace/models/llama2/model.go b/presets/workspace/models/llama2/model.go index 7a81d679f..a9fb5a247 100644 --- a/presets/workspace/models/llama2/model.go +++ b/presets/workspace/models/llama2/model.go @@ -46,15 +46,19 @@ func (*llama2Text7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "14Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. 
- TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(10) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 1, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(10) * time.Minute, + WorldSize: 1, // Tag: llama has private image access mode. The image tag is determined by the user. } - } func (*llama2Text7b) GetTuningParameters() *model.PresetParam { return nil // Currently doesn't support fine-tuning @@ -78,12 +82,17 @@ func (*llama2Text13b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "30Gi", PerGPUMemoryRequirement: "15Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(20) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 2, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(20) * time.Minute, + WorldSize: 2, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -109,12 +118,17 @@ func (*llama2Text70b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "8", TotalGPUMemoryRequirement: "152Gi", PerGPUMemoryRequirement: "19Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 8, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + WorldSize: 8, // Tag: llama has private image access mode. The image tag is determined by the user. } } diff --git a/presets/workspace/models/llama2chat/model.go b/presets/workspace/models/llama2chat/model.go index 735ecc013..431d04b0c 100644 --- a/presets/workspace/models/llama2chat/model.go +++ b/presets/workspace/models/llama2chat/model.go @@ -46,12 +46,17 @@ func (*llama2Chat7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "14Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. 
- TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(10) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 1, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(10) * time.Minute, + WorldSize: 1, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -77,12 +82,18 @@ func (*llama2Chat13b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "30Gi", PerGPUMemoryRequirement: "15Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(20) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 2, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(20) * time.Minute, + + WorldSize: 2, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -108,12 +119,17 @@ func (*llama2Chat70b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "8", TotalGPUMemoryRequirement: "192Gi", PerGPUMemoryRequirement: "19Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 8, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + WorldSize: 8, // Tag: llama has private image access mode. The image tag is determined by the user. 
} } diff --git a/presets/workspace/models/mistral/model.go b/presets/workspace/models/mistral/model.go index 51a23b10c..f9e48dd3d 100644 --- a/presets/workspace/models/mistral/model.go +++ b/presets/workspace/models/mistral/model.go @@ -27,15 +27,20 @@ var ( PresetMistral7BInstructModel = PresetMistral7BModel + "-instruct" PresetMistralTagMap = map[string]string{ - "Mistral7B": "0.0.7", - "Mistral7BInstruct": "0.0.7", + "Mistral7B": "0.0.8", + "Mistral7BInstruct": "0.0.8", } baseCommandPresetMistralInference = "accelerate launch" - baseCommandPresetMistralTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetMistralTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" mistralRunParams = map[string]string{ - "torch_dtype": "bfloat16", - "pipeline": "text-generation", + "torch_dtype": "bfloat16", + "pipeline": "text-generation", + "chat_template": "/workspace/chat_templates/mistral-instruct.jinja", + } + mistralRunParamsVLLM = map[string]string{ + "dtype": "float16", + "chat-template": "/workspace/chat_templates/mistral-instruct.jinja", } ) @@ -51,11 +56,21 @@ func (*mistral7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Mistral using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: mistralRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralInference, - Tag: PresetMistralTagMap["Mistral7B"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralInference, + InferenceMainFile: inference.DefautTransformersMainFile, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "mistral-7b", + ModelRunParams: mistralRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetMistralTagMap["Mistral7B"], } } @@ -67,10 +82,14 @@ func (*mistral7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - //TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunParams: mistralRunParams, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + //TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralTuning, + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralTuning, Tag: PresetMistralTagMap["Mistral7B"], } } @@ -94,11 +113,21 @@ func (*mistral7bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "0Gi", // We run mistral using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: mistralRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralInference, - Tag: PresetMistralTagMap["Mistral7BInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralInference, + InferenceMainFile: inference.DefautTransformersMainFile, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "mistral-7b-instruct", + ModelRunParams: mistralRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetMistralTagMap["Mistral7BInstruct"], } } diff --git a/presets/workspace/models/phi2/model.go b/presets/workspace/models/phi2/model.go index 31f495c7e..eeb305fe5 100644 --- a/presets/workspace/models/phi2/model.go +++ b/presets/workspace/models/phi2/model.go @@ -22,15 +22,18 @@ var ( PresetPhi2Model = "phi-2" PresetPhiTagMap = map[string]string{ - "Phi2": "0.0.5", + "Phi2": "0.0.6", } baseCommandPresetPhiInference = "accelerate launch" - baseCommandPresetPhiTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetPhiTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" phiRunParams = map[string]string{ "torch_dtype": "float16", "pipeline": "text-generation", } + phiRunParamsVLLM = map[string]string{ + "dtype": "float16", + } ) var phiA phi2 @@ -45,11 +48,21 @@ func (*phi2) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "12Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi2"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: phiRunParams, + BaseCommand: baseCommandPresetPhiInference, + InferenceMainFile: inference.DefautTransformersMainFile, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-2", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi2"], } } func (*phi2) GetTuningParameters() *model.PresetParam { @@ -60,10 +73,14 @@ func (*phi2) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - // TorchRunParams: inference.DefaultAccelerateParams, - // ModelRunParams: phiRunParams, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + // TorchRunParams: inference.DefaultAccelerateParams, + // ModelRunParams: phiRunParams, + BaseCommand: baseCommandPresetPhiTuning, + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, Tag: PresetPhiTagMap["Phi2"], } } diff --git a/presets/workspace/models/phi3/model.go b/presets/workspace/models/phi3/model.go index abf63003b..c4db6fd37 100644 --- a/presets/workspace/models/phi3/model.go +++ b/presets/workspace/models/phi3/model.go @@ -42,20 +42,23 @@ var ( PresetPhi3_5MiniInstruct = "phi-3.5-mini-instruct" PresetPhiTagMap = map[string]string{ - "Phi3Mini4kInstruct": "0.0.2", - "Phi3Mini128kInstruct": "0.0.2", - "Phi3Medium4kInstruct": "0.0.2", - "Phi3Medium128kInstruct": "0.0.2", + "Phi3Mini4kInstruct": "0.0.3", + "Phi3Mini128kInstruct": "0.0.3", + "Phi3Medium4kInstruct": "0.0.3", + "Phi3Medium128kInstruct": "0.0.3", "Phi3_5MiniInstruct": "0.0.1", } baseCommandPresetPhiInference = "accelerate launch" - baseCommandPresetPhiTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetPhiTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" phiRunParams = map[string]string{ "torch_dtype": "auto", "pipeline": "text-generation", "trust_remote_code": "", } + phiRunParamsVLLM = map[string]string{ + "dtype": "float16", + } ) var phi3MiniA phi3Mini4KInst @@ -70,11 +73,21 @@ func (*phi3Mini4KInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "9Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-3-mini-4k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], } } func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam { @@ -88,8 +101,12 @@ func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam { // TorchRunParams: inference.DefaultAccelerateParams, // ModelRunParams: phiRunParams, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], } } func (*phi3Mini4KInst) SupportDistributedInference() bool { return false } @@ -109,11 +126,21 @@ func (*phi3Mini128KInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "9Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-3-mini-128k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], } } func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam { @@ -124,11 +151,13 @@ func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "72Gi", PerGPUMemoryRequirement: "72Gi", - // TorchRunParams: inference.DefaultAccelerateParams, - // ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], + ReadinessTimeout: time.Duration(30) * time.Minute, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], } } func (*phi3Mini128KInst) SupportDistributedInference() bool { return false } @@ -148,11 +177,21 @@ func (*phi3_5MiniInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "8Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-3.5-mini-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], } } func (*phi3_5MiniInst) GetTuningParameters() *model.PresetParam { @@ -166,8 +205,12 @@ func (*phi3_5MiniInst) GetTuningParameters() *model.PresetParam { // TorchRunParams: inference.DefaultAccelerateParams, // ModelRunParams: phiRunParams, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], } } func (*phi3_5MiniInst) SupportDistributedInference() bool { return false } @@ -187,11 +230,21 @@ func (*Phi3Medium4kInstruct) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "28Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-3-medium-4k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], } } func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam { @@ -205,8 +258,12 @@ func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam { // TorchRunParams: inference.DefaultAccelerateParams, // ModelRunParams: phiRunParams, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], } } func (*Phi3Medium4kInstruct) SupportDistributedInference() bool { return false } @@ -226,11 +283,21 @@ func (*Phi3Medium128kInstruct) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "28Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-3-medium-128k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], } } func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam { @@ -241,11 +308,13 @@ func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "80Gi", PerGPUMemoryRequirement: "80Gi", - // TorchRunParams: inference.DefaultAccelerateParams, - // ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], + ReadinessTimeout: time.Duration(30) * time.Minute, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], } } func (*Phi3Medium128kInstruct) SupportDistributedInference() bool { return false } diff --git a/test/e2e/preset_vllm_test.go b/test/e2e/preset_vllm_test.go new file mode 100644 index 000000000..0c3c5eb73 --- /dev/null +++ b/test/e2e/preset_vllm_test.go @@ -0,0 +1,195 @@ +package e2e + +import ( + "fmt" + "math/rand" + "time" + + . 
"github.com/onsi/ginkgo/v2" + + kaitov1alpha1 "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/kaito-project/kaito/test/e2e/utils" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var _ = Describe("Workspace Preset on vllm runtime", func() { + BeforeEach(func() { + loadTestEnvVars() + loadModelVersions() + }) + + AfterEach(func() { + if CurrentSpecReport().Failed() { + utils.PrintPodLogsOnFailure(namespaceName, "") // The Preset Pod + utils.PrintPodLogsOnFailure("kaito-workspace", "") // The Kaito Workspace Pod + utils.PrintPodLogsOnFailure("gpu-provisioner", "") // The gpu-provisioner Pod + Fail("Fail threshold reached") + } + }) + + It("should create a falcon workspace with preset public mode successfully", func() { + numOfNode := 1 + workspaceObj := createFalconWorkspaceWithPresetPublicModeAndVLLM(numOfNode) + + defer cleanupResources(workspaceObj) + time.Sleep(30 * time.Second) + + validateCreateNode(workspaceObj, numOfNode) + validateResourceStatus(workspaceObj) + + time.Sleep(30 * time.Second) + + validateAssociatedService(workspaceObj) + + validateInferenceResource(workspaceObj, int32(numOfNode), false) + + validateWorkspaceReadiness(workspaceObj) + }) + + It("should create a mistral workspace with preset public mode successfully", func() { + numOfNode := 1 + workspaceObj := createMistralWorkspaceWithPresetPublicModeAndVLLM(numOfNode) + + defer cleanupResources(workspaceObj) + time.Sleep(30 * time.Second) + + validateCreateNode(workspaceObj, numOfNode) + validateResourceStatus(workspaceObj) + + time.Sleep(30 * time.Second) + + validateAssociatedService(workspaceObj) + + validateInferenceResource(workspaceObj, int32(numOfNode), false) + + validateWorkspaceReadiness(workspaceObj) + }) + + It("should create a Phi-2 workspace with preset public mode successfully", func() { + numOfNode := 1 + workspaceObj := createPhi2WorkspaceWithPresetPublicModeAndVLLM(numOfNode) + + defer cleanupResources(workspaceObj) + time.Sleep(30 * time.Second) + + validateCreateNode(workspaceObj, numOfNode) + validateResourceStatus(workspaceObj) + + time.Sleep(30 * time.Second) + + validateAssociatedService(workspaceObj) + + validateInferenceResource(workspaceObj, int32(numOfNode), false) + + validateWorkspaceReadiness(workspaceObj) + }) + + It("should create a Phi-3-mini-128k-instruct workspace with preset public mode successfully", func() { + numOfNode := 1 + workspaceObj := createPhi3WorkspaceWithPresetPublicModeAndVLLM(numOfNode) + + defer cleanupResources(workspaceObj) + time.Sleep(30 * time.Second) + + validateCreateNode(workspaceObj, numOfNode) + validateResourceStatus(workspaceObj) + + time.Sleep(30 * time.Second) + + validateAssociatedService(workspaceObj) + + validateInferenceResource(workspaceObj, int32(numOfNode), false) + + validateWorkspaceReadiness(workspaceObj) + }) + + It("should create a mistral workspace with preset public mode and 2 gpu successfully", func() { + // single node with 2 gpu + numOfNode := 1 + workspaceObj := createMistralWorkspaceWithPresetPublicModeAndVLLMAndMultiGPU(numOfNode) + + defer cleanupResources(workspaceObj) + time.Sleep(30 * time.Second) + + validateCreateNode(workspaceObj, numOfNode) + validateResourceStatus(workspaceObj) + + time.Sleep(30 * time.Second) + + validateAssociatedService(workspaceObj) + + validateInferenceResource(workspaceObj, int32(numOfNode), false) + + validateWorkspaceReadiness(workspaceObj) + }) +}) + +func createFalconWorkspaceWithPresetPublicModeAndVLLM(numOfNode int) *kaitov1alpha1.Workspace { + workspaceObj := 
&kaitov1alpha1.Workspace{} + By("Creating a workspace CR with Falcon 7B preset public mode and vLLM", func() { + uniqueID := fmt.Sprint("preset-falcon-", rand.Intn(1000)) + workspaceObj = utils.GenerateInferenceWorkspaceManifestWithVLLM(uniqueID, namespaceName, "", numOfNode, "Standard_NC6s_v3", + &metav1.LabelSelector{ + MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-falcon-vllm"}, + }, nil, PresetFalcon7BModel, kaitov1alpha1.ModelImageAccessModePublic, nil, nil, nil) + + createAndValidateWorkspace(workspaceObj) + }) + return workspaceObj +} + +func createMistralWorkspaceWithPresetPublicModeAndVLLM(numOfNode int) *kaitov1alpha1.Workspace { + workspaceObj := &kaitov1alpha1.Workspace{} + By("Creating a workspace CR with Mistral 7B preset public mode and vLLM", func() { + uniqueID := fmt.Sprint("preset-mistral-", rand.Intn(1000)) + workspaceObj = utils.GenerateInferenceWorkspaceManifestWithVLLM(uniqueID, namespaceName, "", numOfNode, "Standard_NC6s_v3", + &metav1.LabelSelector{ + MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-mistral-vllm"}, + }, nil, PresetMistral7BInstructModel, kaitov1alpha1.ModelImageAccessModePublic, nil, nil, nil) + + createAndValidateWorkspace(workspaceObj) + }) + return workspaceObj +} + +func createPhi2WorkspaceWithPresetPublicModeAndVLLM(numOfNode int) *kaitov1alpha1.Workspace { + workspaceObj := &kaitov1alpha1.Workspace{} + By("Creating a workspace CR with Phi 2 preset public mode and vLLM", func() { + uniqueID := fmt.Sprint("preset-phi2-", rand.Intn(1000)) + workspaceObj = utils.GenerateInferenceWorkspaceManifestWithVLLM(uniqueID, namespaceName, "", numOfNode, "Standard_NC6s_v3", + &metav1.LabelSelector{ + MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-phi-2-vllm"}, + }, nil, PresetPhi2Model, kaitov1alpha1.ModelImageAccessModePublic, nil, nil, nil) + + createAndValidateWorkspace(workspaceObj) + }) + return workspaceObj +} + +func createPhi3WorkspaceWithPresetPublicModeAndVLLM(numOfNode int) *kaitov1alpha1.Workspace { + workspaceObj := &kaitov1alpha1.Workspace{} + By("Creating a workspace CR with Phi-3-mini-128k-instruct preset public mode and vLLM", func() { + uniqueID := fmt.Sprint("preset-phi3-", rand.Intn(1000)) + workspaceObj = utils.GenerateInferenceWorkspaceManifestWithVLLM(uniqueID, namespaceName, "", numOfNode, "Standard_NC6s_v3", + &metav1.LabelSelector{ + MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-phi-3-mini-128k-instruct-vllm"}, + }, nil, PresetPhi3Mini128kModel, kaitov1alpha1.ModelImageAccessModePublic, nil, nil, nil) + + createAndValidateWorkspace(workspaceObj) + }) + return workspaceObj +} + +func createMistralWorkspaceWithPresetPublicModeAndVLLMAndMultiGPU(numOfNode int) *kaitov1alpha1.Workspace { + workspaceObj := &kaitov1alpha1.Workspace{} + By("Creating a workspace CR with Mistral 7B preset public mode and vLLM", func() { + uniqueID := fmt.Sprint("preset-mistral-2gpu-", rand.Intn(1000)) + workspaceObj = utils.GenerateInferenceWorkspaceManifestWithVLLM(uniqueID, namespaceName, "", numOfNode, "Standard_NC12s_v3", + &metav1.LabelSelector{ + MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-mistral-2gpu-vllm"}, + }, nil, PresetMistral7BInstructModel, kaitov1alpha1.ModelImageAccessModePublic, nil, nil, nil) + + createAndValidateWorkspace(workspaceObj) + }) + return workspaceObj +} diff --git a/test/e2e/utils/utils.go b/test/e2e/utils/utils.go index 48b18e890..100c3fc01 100644 --- a/test/e2e/utils/utils.go +++ 
b/test/e2e/utils/utils.go @@ -21,6 +21,7 @@ import ( "k8s.io/client-go/tools/clientcmd" kaitov1alpha1 "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/kaito-project/kaito/pkg/model" "github.com/samber/lo" "gopkg.in/yaml.v2" corev1 "k8s.io/api/core/v1" @@ -262,6 +263,20 @@ func GenerateInferenceWorkspaceManifest(name, namespace, imageName string, resou return workspace } +func GenerateInferenceWorkspaceManifestWithVLLM(name, namespace, imageName string, resourceCount int, instanceType string, + labelSelector *metav1.LabelSelector, preferredNodes []string, presetName kaitov1alpha1.ModelName, + accessMode kaitov1alpha1.ModelImageAccessMode, imagePullSecret []string, + podTemplate *corev1.PodTemplateSpec, adapters []kaitov1alpha1.AdapterSpec) *kaitov1alpha1.Workspace { + workspace := GenerateInferenceWorkspaceManifest(name, namespace, imageName, resourceCount, instanceType, + labelSelector, preferredNodes, presetName, accessMode, imagePullSecret, podTemplate, adapters) + + if workspace.Annotations == nil { + workspace.Annotations = make(map[string]string) + } + workspace.Annotations[kaitov1alpha1.AnnotationWorkspaceRuntime] = string(model.RuntimeNameVLLM) + return workspace +} + func GenerateTuningWorkspaceManifest(name, namespace, imageName string, resourceCount int, instanceType string, labelSelector *metav1.LabelSelector, preferredNodes []string, input *kaitov1alpha1.DataSource, output *kaitov1alpha1.DataDestination, preset *kaitov1alpha1.PresetSpec, method kaitov1alpha1.TuningMethod) *kaitov1alpha1.Workspace { From 2c7eae50414c2cdc06bacbf2cb332d6de851b5f4 Mon Sep 17 00:00:00 2001 From: jerryzhuang Date: Mon, 2 Dec 2024 19:15:02 +1100 Subject: [PATCH 2/3] set vllm as default Signed-off-by: jerryzhuang --- Makefile | 2 +- api/v1alpha1/labels.go | 7 ++++--- pkg/featuregates/featuregates.go | 2 +- pkg/utils/common-preset.go | 5 +---- pkg/utils/test/testUtils.go | 9 ++++++--- test/e2e/utils/utils.go | 4 ++++ 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 99ca81f7f..5e0a03ef2 100644 --- a/Makefile +++ b/Makefile @@ -127,7 +127,7 @@ GINKGO_SKIP ?= GINKGO_NODES ?= 2 GINKGO_NO_COLOR ?= false GINKGO_TIMEOUT ?= 120m -GINKGO_ARGS ?= -focus="$(GINKGO_FOCUS)" -skip="$(GINKGO_SKIP)" -nodes=$(GINKGO_NODES) -no-color=$(GINKGO_NO_COLOR) --output-interceptor-mode=none -timeout=$(GINKGO_TIMEOUT) +GINKGO_ARGS ?= -focus="$(GINKGO_FOCUS)" -skip="$(GINKGO_SKIP)" -nodes=$(GINKGO_NODES) -no-color=$(GINKGO_NO_COLOR) --output-interceptor-mode=none -timeout=$(GINKGO_TIMEOUT) --fail-fast $(E2E_TEST): (cd test/e2e && go test -c . 
-o $(E2E_TEST)) diff --git a/api/v1alpha1/labels.go b/api/v1alpha1/labels.go index e66c8d427..99b9a549a 100644 --- a/api/v1alpha1/labels.go +++ b/api/v1alpha1/labels.go @@ -46,11 +46,12 @@ func GetWorkspaceRuntimeName(ws *Workspace) model.RuntimeName { if ws == nil { panic("workspace is nil") } - runtime := model.RuntimeNameHuggingfaceTransformers - if featuregates.FeatureGates[consts.FeatureFlagVLLM] { - runtime = model.RuntimeNameVLLM + + if !featuregates.FeatureGates[consts.FeatureFlagVLLM] { + return model.RuntimeNameHuggingfaceTransformers } + runtime := model.RuntimeNameVLLM name := ws.Annotations[AnnotationWorkspaceRuntime] switch name { case string(model.RuntimeNameHuggingfaceTransformers): diff --git a/pkg/featuregates/featuregates.go b/pkg/featuregates/featuregates.go index 3fea1b2f5..2b4a9942f 100644 --- a/pkg/featuregates/featuregates.go +++ b/pkg/featuregates/featuregates.go @@ -15,7 +15,7 @@ var ( // FeatureGates is a map that holds the feature gates and their default values for Kaito. FeatureGates = map[string]bool{ consts.FeatureFlagKarpenter: false, - consts.FeatureFlagVLLM: false, + consts.FeatureFlagVLLM: true, // Add more feature gates here } ) diff --git a/pkg/utils/common-preset.go b/pkg/utils/common-preset.go index 5af7936a8..c9f5f8dd0 100644 --- a/pkg/utils/common-preset.go +++ b/pkg/utils/common-preset.go @@ -4,7 +4,6 @@ package utils import ( corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" ) const ( @@ -66,14 +65,12 @@ func ConfigSHMVolume(instanceCount int) (corev1.Volume, corev1.VolumeMount) { // Signifies multinode inference requirement if instanceCount > 1 { - size := resource.MustParse("4Gi") // Append share memory volume to any existing volumes volume = corev1.Volume{ Name: "dshm", VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{ - Medium: "Memory", - SizeLimit: &size, + Medium: "Memory", }, }, } diff --git a/pkg/utils/test/testUtils.go b/pkg/utils/test/testUtils.go index b0f728f6f..881080422 100644 --- a/pkg/utils/test/testUtils.go +++ b/pkg/utils/test/testUtils.go @@ -30,6 +30,9 @@ var ( ObjectMeta: metav1.ObjectMeta{ Name: "testWorkspace", Namespace: "kaito", + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameHuggingfaceTransformers), + }, }, Resource: v1alpha1.ResourceSpec{ Count: &gpuNodeCount, @@ -122,6 +125,9 @@ var ( ObjectMeta: metav1.ObjectMeta{ Name: "testWorkspace", Namespace: "kaito", + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameHuggingfaceTransformers), + }, }, Resource: v1alpha1.ResourceSpec{ Count: &gpuNodeCount, @@ -144,9 +150,6 @@ var ( ObjectMeta: metav1.ObjectMeta{ Name: "testWorkspace", Namespace: "kaito", - Annotations: map[string]string{ - v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameVLLM), - }, }, Resource: v1alpha1.ResourceSpec{ Count: &gpuNodeCount, diff --git a/test/e2e/utils/utils.go b/test/e2e/utils/utils.go index 100c3fc01..9beb2d18e 100644 --- a/test/e2e/utils/utils.go +++ b/test/e2e/utils/utils.go @@ -20,6 +20,7 @@ import ( "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" + "github.com/kaito-project/kaito/api/v1alpha1" kaitov1alpha1 "github.com/kaito-project/kaito/api/v1alpha1" "github.com/kaito-project/kaito/pkg/model" "github.com/samber/lo" @@ -285,6 +286,9 @@ func GenerateTuningWorkspaceManifest(name, namespace, imageName string, resource ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: namespace, + Annotations: map[string]string{ + 
v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameHuggingfaceTransformers), + }, }, Resource: kaitov1alpha1.ResourceSpec{ Count: lo.ToPtr(resourceCount), From e9f5b030dda0392481715328379ce84713b87208 Mon Sep 17 00:00:00 2001 From: jerryzhuang Date: Mon, 2 Dec 2024 19:40:16 +1100 Subject: [PATCH 3/3] fix Signed-off-by: jerryzhuang --- test/e2e/utils/utils.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/e2e/utils/utils.go b/test/e2e/utils/utils.go index 9beb2d18e..3cdf5720c 100644 --- a/test/e2e/utils/utils.go +++ b/test/e2e/utils/utils.go @@ -227,6 +227,9 @@ func GenerateInferenceWorkspaceManifest(name, namespace, imageName string, resou ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: namespace, + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameHuggingfaceTransformers), + }, }, Resource: kaitov1alpha1.ResourceSpec{ Count: lo.ToPtr(resourceCount), @@ -286,9 +289,6 @@ func GenerateTuningWorkspaceManifest(name, namespace, imageName string, resource ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: namespace, - Annotations: map[string]string{ - v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameHuggingfaceTransformers), - }, }, Resource: kaitov1alpha1.ResourceSpec{ Count: lo.ToPtr(resourceCount),
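For reference, a minimal sketch of how the runtime selection added in this series resolves once the vLLM feature gate defaults to true: a Workspace without the runtime annotation falls through to vLLM, while annotating it with the transformers runtime name pins it back to the Huggingface transformers stack. The snippet below is illustrative only and is not part of the patches; the workspace name is hypothetical, and it assumes the api/v1alpha1 and pkg/model packages exactly as imported in the e2e utilities above.

package main

import (
	"fmt"

	kaitov1alpha1 "github.com/kaito-project/kaito/api/v1alpha1"
	"github.com/kaito-project/kaito/pkg/model"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Hypothetical workspace, used only to illustrate runtime resolution.
	ws := &kaitov1alpha1.Workspace{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "example-workspace", // hypothetical name
			Namespace: "kaito",
			Annotations: map[string]string{
				// Pin this workspace to the transformers runtime. Without this
				// annotation, the vLLM feature gate (now defaulting to true)
				// resolves the runtime to vLLM.
				kaitov1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameHuggingfaceTransformers),
			},
		},
	}

	// Prints "transformers"; removing the annotation above makes it print "vllm".
	fmt.Println(kaitov1alpha1.GetWorkspaceRuntimeName(ws))
}

The e2e helper GenerateInferenceWorkspaceManifestWithVLLM does the inverse: it sets the annotation to model.RuntimeNameVLLM so the vLLM path is exercised explicitly, regardless of the feature gate's default.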