From 8244ebc27370aef3121c8885852bfceef48ba724 Mon Sep 17 00:00:00 2001 From: jerryzhuang Date: Thu, 17 Oct 2024 17:42:13 +1100 Subject: [PATCH] feat: support vllm in controller - set vllm as the default runtime Signed-off-by: jerryzhuang --- api/v1alpha1/labels.go | 30 ++++ api/v1alpha1/workspace_validation.go | 4 +- pkg/featuregates/featuregates.go | 1 + pkg/model/interface.go | 145 ++++++++++++++-- pkg/utils/common-preset.go | 5 - pkg/utils/common.go | 14 +- pkg/utils/common_test.go | 2 +- pkg/utils/consts/consts.go | 5 +- pkg/utils/plugin/plugin.go | 4 + pkg/utils/test/testModel.go | 22 ++- pkg/utils/test/testUtils.go | 26 +++ .../controllers/workspace_controller.go | 2 +- pkg/workspace/inference/preset-inferences.go | 87 +++++----- .../inference/preset-inferences_test.go | 56 +++++-- pkg/workspace/tuning/preset-tuning.go | 12 +- pkg/workspace/tuning/preset-tuning_test.go | 10 +- presets/models/falcon/model.go | 116 +++++++++---- presets/models/llama2/model.go | 64 ++++--- presets/models/llama2chat/model.go | 64 +++++-- presets/models/mistral/model.go | 63 +++++-- presets/models/phi2/model.go | 36 ++-- presets/models/phi3/model.go | 156 ++++++++++++------ 22 files changed, 675 insertions(+), 249 deletions(-) diff --git a/api/v1alpha1/labels.go b/api/v1alpha1/labels.go index 7e807c29a..e66c8d427 100644 --- a/api/v1alpha1/labels.go +++ b/api/v1alpha1/labels.go @@ -3,6 +3,12 @@ package v1alpha1 +import ( + "github.com/kaito-project/kaito/pkg/featuregates" + "github.com/kaito-project/kaito/pkg/model" + "github.com/kaito-project/kaito/pkg/utils/consts" +) + const ( // Non-prefixed labels/annotations are reserved for end-use. @@ -30,4 +36,28 @@ const ( // RAGEngineRevisionAnnotation is the Annotations for revision number RAGEngineRevisionAnnotation = "ragengine.kaito.io/revision" + + // AnnotationWorkspaceRuntime is the annotation for runtime selection. 
+ AnnotationWorkspaceRuntime = KAITOPrefix + "runtime" ) + +// GetWorkspaceRuntimeName returns the runtime name of the workspace. +func GetWorkspaceRuntimeName(ws *Workspace) model.RuntimeName { + if ws == nil { + panic("workspace is nil") + } + runtime := model.RuntimeNameHuggingfaceTransformers + if featuregates.FeatureGates[consts.FeatureFlagVLLM] { + runtime = model.RuntimeNameVLLM + } + + name := ws.Annotations[AnnotationWorkspaceRuntime] + switch name { + case string(model.RuntimeNameHuggingfaceTransformers): + runtime = model.RuntimeNameHuggingfaceTransformers + case string(model.RuntimeNameVLLM): + runtime = model.RuntimeNameVLLM + } + + return runtime +} diff --git a/api/v1alpha1/workspace_validation.go b/api/v1alpha1/workspace_validation.go index 9a4183c69..703322c26 100644 --- a/api/v1alpha1/workspace_validation.go +++ b/api/v1alpha1/workspace_validation.go @@ -169,7 +169,7 @@ func (r *TuningSpec) validateCreate(ctx context.Context, workspaceNamespace stri // Currently require a preset to specified, in future we can consider defining a template if r.Preset == nil { errs = errs.Also(apis.ErrMissingField("Preset")) - } else if presetName := string(r.Preset.Name); !utils.IsValidPreset(presetName) { + } else if presetName := string(r.Preset.Name); !plugin.IsValidPreset(presetName) { errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported tuning preset name %s", presetName), "presetName")) } return errs @@ -407,7 +407,7 @@ func (i *InferenceSpec) validateCreate() (errs *apis.FieldError) { if i.Preset != nil { presetName := string(i.Preset.Name) // Validate preset name - if !utils.IsValidPreset(presetName) { + if !plugin.IsValidPreset(presetName) { errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported inference preset name %s", presetName), "presetName")) } // Validate private preset has private image specified diff --git a/pkg/featuregates/featuregates.go b/pkg/featuregates/featuregates.go index 2161210e1..3fea1b2f5 100644 --- 
a/pkg/featuregates/featuregates.go +++ b/pkg/featuregates/featuregates.go @@ -15,6 +15,7 @@ var ( // FeatureGates is a map that holds the feature gates and their default values for Kaito. FeatureGates = map[string]bool{ consts.FeatureFlagKarpenter: false, + consts.FeatureFlagVLLM: false, // Add more feature gates here } ) diff --git a/pkg/model/interface.go b/pkg/model/interface.go index 56c925698..c50c144a4 100644 --- a/pkg/model/interface.go +++ b/pkg/model/interface.go @@ -4,6 +4,8 @@ package model import ( "time" + + "github.com/kaito-project/kaito/pkg/utils" ) type Model interface { @@ -13,23 +15,140 @@ type Model interface { SupportTuning() bool } +// RuntimeName is LLM runtime name. +type RuntimeName string + +const ( + RuntimeNameHuggingfaceTransformers RuntimeName = "transformers" + RuntimeNameVLLM RuntimeName = "vllm" + + InferenceFileHuggingface = "/workspace/tfs/inference_api.py" + InferenceFileVLLM = "/workspace/vllm/inference_api.py" +) + // PresetParam defines the preset inference parameters for a model. type PresetParam struct { - ModelFamilyName string // The name of the model family. - ImageAccessMode string // Defines where the Image is Public or Private. - DiskStorageRequirement string // Disk storage requirements for the model. - GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference. - TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference. - PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference. - TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning. - TorchRunParams map[string]string // Parameters for configuring the torchrun command. - TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic). - BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. 
- ModelRunParams map[string]string // Parameters for running the model training/inference. + Tag string // The model image tag + ModelFamilyName string // The name of the model family. + ImageAccessMode string // Defines where the Image is Public or Private. + + DiskStorageRequirement string // Disk storage requirements for the model. + GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference. + TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference. + PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference. + TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning. + WorldSize int // Defines the number of processes required for distributed inference. + + RuntimeParam + // ReadinessTimeout defines the maximum duration for creating the workload. // This timeout accommodates the size of the image, ensuring pull completion // even under slower network conditions or unforeseen delays. ReadinessTimeout time.Duration - WorldSize int // Defines the number of processes required for distributed inference. - Tag string // The model image tag +} + +// RuntimeParam defines the llm runtime parameters. +type RuntimeParam struct { + Transformers HuggingfaceTransformersParam + VLLM VLLMParam +} + +type HuggingfaceTransformersParam struct { + BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. + TorchRunParams map[string]string // Parameters for configuring the torchrun command. + TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic). + ModelRunParams map[string]string // Parameters for running the model training/inference. +} + +type VLLMParam struct { + BaseCommand string + // The model name used in the openai serving API. 
+ // see https://platform.openai.com/docs/api-reference/chat/create#chat-create-model. + ModelName string + // Parameters for distributed inference. + DistributionParams map[string]string + // Parameters for running the model training/inference. + ModelRunParams map[string]string +} + +func (p *PresetParam) DeepCopy() *PresetParam { + if p == nil { + return nil + } + out := new(PresetParam) + *out = *p + out.RuntimeParam = p.RuntimeParam.DeepCopy() + out.TuningPerGPUMemoryRequirement = make(map[string]int, len(p.TuningPerGPUMemoryRequirement)) + for k, v := range p.TuningPerGPUMemoryRequirement { + out.TuningPerGPUMemoryRequirement[k] = v + } + return out +} + +func (rp *RuntimeParam) DeepCopy() RuntimeParam { + if rp == nil { + return RuntimeParam{} + } + out := RuntimeParam{} + out.Transformers = rp.Transformers.DeepCopy() + out.VLLM = rp.VLLM.DeepCopy() + return out +} + +func (h *HuggingfaceTransformersParam) DeepCopy() HuggingfaceTransformersParam { + if h == nil { + return HuggingfaceTransformersParam{} + } + out := HuggingfaceTransformersParam{} + out.BaseCommand = h.BaseCommand + out.TorchRunParams = make(map[string]string, len(h.TorchRunParams)) + for k, v := range h.TorchRunParams { + out.TorchRunParams[k] = v + } + out.TorchRunRdzvParams = make(map[string]string, len(h.TorchRunRdzvParams)) + for k, v := range h.TorchRunRdzvParams { + out.TorchRunRdzvParams[k] = v + } + out.ModelRunParams = make(map[string]string, len(h.ModelRunParams)) + for k, v := range h.ModelRunParams { + out.ModelRunParams[k] = v + } + return out +} + +func (v *VLLMParam) DeepCopy() VLLMParam { + if v == nil { + return VLLMParam{} + } + out := VLLMParam{BaseCommand: v.BaseCommand} + out.ModelName = v.ModelName // must survive the copy: GetInferenceCommand reads it to set served-model-name + out.DistributionParams = make(map[string]string, len(v.DistributionParams)) + for k, val := range v.DistributionParams { + out.DistributionParams[k] = val + } + out.ModelRunParams = make(map[string]string, len(v.ModelRunParams)) + for k, val := range v.ModelRunParams { + out.ModelRunParams[k] = val + } +
return out +} + +// builds the container command: +// eg. torchrun baseCommand +func (p *PresetParam) GetInferenceCommand(runtime RuntimeName, skuNumGPUs string) []string { + switch runtime { + case RuntimeNameHuggingfaceTransformers: + torchCommand := utils.BuildCmdStr(p.Transformers.BaseCommand, p.Transformers.TorchRunParams, p.Transformers.TorchRunRdzvParams) + modelCommand := utils.BuildCmdStr(InferenceFileHuggingface, p.Transformers.ModelRunParams) + return utils.ShellCmd(torchCommand + " " + modelCommand) + case RuntimeNameVLLM: + if p.VLLM.ModelName != "" { + p.VLLM.ModelRunParams["served-model-name"] = p.VLLM.ModelName + } + p.VLLM.ModelRunParams["tensor-parallel-size"] = skuNumGPUs + modelCommand := utils.BuildCmdStr(InferenceFileVLLM, p.VLLM.ModelRunParams) + return utils.ShellCmd(p.VLLM.BaseCommand + " " + modelCommand) + default: + return nil + } } diff --git a/pkg/utils/common-preset.go b/pkg/utils/common-preset.go index 3a43f6d0d..c9f5f8dd0 100644 --- a/pkg/utils/common-preset.go +++ b/pkg/utils/common-preset.go @@ -3,7 +3,6 @@ package utils import ( - "github.com/kaito-project/kaito/pkg/utils/plugin" corev1 "k8s.io/api/core/v1" ) @@ -150,7 +149,3 @@ func ConfigAdapterVolume() (corev1.Volume, corev1.VolumeMount) { } return volume, volumeMount } - -func IsValidPreset(preset string) bool { - return plugin.KaitoModelRegister.Has(preset) -} diff --git a/pkg/utils/common.go b/pkg/utils/common.go index af0c7e4cf..cda6e7d9a 100644 --- a/pkg/utils/common.go +++ b/pkg/utils/common.go @@ -68,13 +68,15 @@ func MergeConfigMaps(baseMap, overrideMap map[string]string) map[string]string { return merged } -func BuildCmdStr(baseCommand string, runParams map[string]string) string { +func BuildCmdStr(baseCommand string, runParams ...map[string]string) string { updatedBaseCommand := baseCommand - for key, value := range runParams { - if value == "" { - updatedBaseCommand = fmt.Sprintf("%s --%s", updatedBaseCommand, key) - } else { - updatedBaseCommand = fmt.Sprintf("%s 
--%s=%s", updatedBaseCommand, key, value) + for _, runParam := range runParams { + for key, value := range runParam { + if value == "" { + updatedBaseCommand = fmt.Sprintf("%s --%s", updatedBaseCommand, key) + } else { + updatedBaseCommand = fmt.Sprintf("%s --%s=%s", updatedBaseCommand, key, value) + } } } diff --git a/pkg/utils/common_test.go b/pkg/utils/common_test.go index e23997692..b214150e7 100644 --- a/pkg/utils/common_test.go +++ b/pkg/utils/common_test.go @@ -2,7 +2,6 @@ package utils import ( "context" - "sigs.k8s.io/controller-runtime/pkg/client" "testing" "github.com/stretchr/testify/assert" @@ -12,6 +11,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) diff --git a/pkg/utils/consts/consts.go b/pkg/utils/consts/consts.go index 6756302d5..bcb1b8c58 100644 --- a/pkg/utils/consts/consts.go +++ b/pkg/utils/consts/consts.go @@ -11,7 +11,6 @@ const ( // RAGEngineFinalizer is used to make sure that ragengine controller handles garbage collection. 
RAGEngineFinalizer = "ragengine.finalizer.kaito.sh" DefaultReleaseNamespaceEnvVar = "RELEASE_NAMESPACE" - FeatureFlagKarpenter = "Karpenter" AzureCloudName = "azure" AWSCloudName = "aws" GPUString = "gpu" @@ -20,6 +19,10 @@ const ( GiBToBytes = 1024 * 1024 * 1024 // Conversion factor from GiB to bytes NvidiaGPU = "nvidia.com/gpu" + // Feature flags + FeatureFlagKarpenter = "Karpenter" + FeatureFlagVLLM = "vLLM" + // Nodeclaim related consts KaitoNodePoolName = "kaito" LabelNodePool = "karpenter.sh/nodepool" diff --git a/pkg/utils/plugin/plugin.go b/pkg/utils/plugin/plugin.go index 35706cb9c..62f048265 100644 --- a/pkg/utils/plugin/plugin.go +++ b/pkg/utils/plugin/plugin.go @@ -60,3 +60,7 @@ func (reg *ModelRegister) Has(name string) bool { _, ok := reg.models[name] return ok } + +func IsValidPreset(preset string) bool { + return KaitoModelRegister.Has(preset) +} diff --git a/pkg/utils/test/testModel.go b/pkg/utils/test/testModel.go index 2a820191f..c5efa1293 100644 --- a/pkg/utils/test/testModel.go +++ b/pkg/utils/test/testModel.go @@ -15,8 +15,15 @@ type testModel struct{} func (*testModel) GetInferenceParameters() *model.PresetParam { return &model.PresetParam{ GPUCountRequirement: "1", - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: "python3", + RuntimeParam: model.RuntimeParam{ + VLLM: model.VLLMParam{ + BaseCommand: "python3", + }, + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "accelerate launch", + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, } } func (*testModel) GetTuningParameters() *model.PresetParam { @@ -37,8 +44,15 @@ type testDistributedModel struct{} func (*testDistributedModel) GetInferenceParameters() *model.PresetParam { return &model.PresetParam{ GPUCountRequirement: "1", - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: "python3", + RuntimeParam: model.RuntimeParam{ + VLLM: model.VLLMParam{ + BaseCommand: "python3", + }, + Transformers: model.HuggingfaceTransformersParam{ 
+ BaseCommand: "accelerate launch", + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, } } func (*testDistributedModel) GetTuningParameters() *model.PresetParam { diff --git a/pkg/utils/test/testUtils.go b/pkg/utils/test/testUtils.go index 912cd2bc0..b0f728f6f 100644 --- a/pkg/utils/test/testUtils.go +++ b/pkg/utils/test/testUtils.go @@ -6,6 +6,7 @@ package test import ( "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/kaito-project/kaito/pkg/model" "github.com/samber/lo" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -139,6 +140,31 @@ var ( }, }, } + MockWorkspaceWithPresetVLLM = &v1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testWorkspace", + Namespace: "kaito", + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameVLLM), + }, + }, + Resource: v1alpha1.ResourceSpec{ + Count: &gpuNodeCount, + InstanceType: "Standard_NC12s_v3", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "apps": "test", + }, + }, + }, + Inference: &v1alpha1.InferenceSpec{ + Preset: &v1alpha1.PresetSpec{ + PresetMeta: v1alpha1.PresetMeta{ + Name: "test-model", + }, + }, + }, + } ) var MockWorkspaceWithPresetHash = "89ae127050ec264a5ce84db48ef7226574cdf1299e6bd27fe90b927e34cc8adb" diff --git a/pkg/workspace/controllers/workspace_controller.go b/pkg/workspace/controllers/workspace_controller.go index d5d9eee52..7762a582a 100644 --- a/pkg/workspace/controllers/workspace_controller.go +++ b/pkg/workspace/controllers/workspace_controller.go @@ -780,7 +780,7 @@ func (c *WorkspaceReconciler) applyInference(ctx context.Context, wObj *kaitov1a } else if apierrors.IsNotFound(err) { var workloadObj client.Object // Need to create a new workload - workloadObj, err = inference.CreatePresetInference(ctx, wObj, revisionStr, inferenceParam, model.SupportDistributedInference(), c.Client) + workloadObj, err = 
inference.CreatePresetInference(ctx, wObj, revisionStr, model, c.Client) if err != nil { return } diff --git a/pkg/workspace/inference/preset-inferences.go b/pkg/workspace/inference/preset-inferences.go index 1817f7374..f66e87179 100644 --- a/pkg/workspace/inference/preset-inferences.go +++ b/pkg/workspace/inference/preset-inferences.go @@ -11,6 +11,7 @@ import ( "github.com/kaito-project/kaito/pkg/utils" "github.com/kaito-project/kaito/pkg/utils/consts" + "github.com/kaito-project/kaito/api/v1alpha1" kaitov1alpha1 "github.com/kaito-project/kaito/api/v1alpha1" "github.com/kaito-project/kaito/pkg/model" "github.com/kaito-project/kaito/pkg/utils/resources" @@ -23,9 +24,8 @@ import ( ) const ( - ProbePath = "/healthz" - Port5000 = int32(5000) - InferenceFile = "inference_api.py" + ProbePath = "/health" + Port5000 = int32(5000) ) var ( @@ -71,7 +71,12 @@ var ( } ) -func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient client.Client, wObj *kaitov1alpha1.Workspace, inferenceObj *model.PresetParam) error { +func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient client.Client, wObj *kaitov1alpha1.Workspace, inferenceParam *model.PresetParam) error { + runtimeName := v1alpha1.GetWorkspaceRuntimeName(wObj) + if runtimeName != model.RuntimeNameHuggingfaceTransformers { + return fmt.Errorf("distributed inference is not supported for runtime %s", runtimeName) + } + existingService := &corev1.Service{} err := resources.GetResource(ctx, wObj.Name, wObj.Namespace, kubeClient, existingService) if err != nil { @@ -79,18 +84,18 @@ func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient cl } nodes := *wObj.Resource.Count - inferenceObj.TorchRunParams["nnodes"] = strconv.Itoa(nodes) - inferenceObj.TorchRunParams["nproc_per_node"] = strconv.Itoa(inferenceObj.WorldSize / nodes) + inferenceParam.Transformers.TorchRunParams["nnodes"] = strconv.Itoa(nodes) + inferenceParam.Transformers.TorchRunParams["nproc_per_node"] = 
strconv.Itoa(inferenceParam.WorldSize / nodes) if nodes > 1 { - inferenceObj.TorchRunParams["node_rank"] = "$(echo $HOSTNAME | grep -o '[^-]*$')" - inferenceObj.TorchRunParams["master_addr"] = existingService.Spec.ClusterIP - inferenceObj.TorchRunParams["master_port"] = "29500" - } - if inferenceObj.TorchRunRdzvParams != nil { - inferenceObj.TorchRunRdzvParams["max_restarts"] = "3" - inferenceObj.TorchRunRdzvParams["rdzv_id"] = "job" - inferenceObj.TorchRunRdzvParams["rdzv_backend"] = "c10d" - inferenceObj.TorchRunRdzvParams["rdzv_endpoint"] = + inferenceParam.Transformers.TorchRunParams["node_rank"] = "$(echo $HOSTNAME | grep -o '[^-]*$')" + inferenceParam.Transformers.TorchRunParams["master_addr"] = existingService.Spec.ClusterIP + inferenceParam.Transformers.TorchRunParams["master_port"] = "29500" + } + if inferenceParam.Transformers.TorchRunRdzvParams != nil { + inferenceParam.Transformers.TorchRunRdzvParams["max_restarts"] = "3" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_id"] = "job" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_backend"] = "c10d" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_endpoint"] = fmt.Sprintf("%s-0.%s-headless.%s.svc.cluster.local:29500", wObj.Name, wObj.Name, wObj.Namespace) } return nil @@ -122,14 +127,17 @@ func GetInferenceImageInfo(ctx context.Context, workspaceObj *kaitov1alpha1.Work } func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, revisionNum string, - inferenceObj *model.PresetParam, supportDistributedInference bool, kubeClient client.Client) (client.Object, error) { - if inferenceObj.TorchRunParams != nil && supportDistributedInference { - if err := updateTorchParamsForDistributedInference(ctx, kubeClient, workspaceObj, inferenceObj); err != nil { + model model.Model, kubeClient client.Client) (client.Object, error) { + inferenceParam := model.GetInferenceParameters().DeepCopy() + + if model.SupportDistributedInference() { + if err := 
updateTorchParamsForDistributedInference(ctx, kubeClient, workspaceObj, inferenceParam); err != nil { // klog.ErrorS(err, "failed to update torch params", "workspace", workspaceObj) return nil, err } } + // additional volume var volumes []corev1.Volume var volumeMounts []corev1.VolumeMount shmVolume, shmVolumeMount := utils.ConfigSHMVolume(*workspaceObj.Resource.Count) @@ -139,24 +147,35 @@ func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Work if shmVolumeMount.Name != "" { volumeMounts = append(volumeMounts, shmVolumeMount) } - if len(workspaceObj.Inference.Adapters) > 0 { adapterVolume, adapterVolumeMount := utils.ConfigAdapterVolume() volumes = append(volumes, adapterVolume) volumeMounts = append(volumeMounts, adapterVolumeMount) } + // resource requirements skuNumGPUs, err := utils.GetSKUNumGPUs(ctx, kubeClient, workspaceObj.Status.WorkerNodes, - workspaceObj.Resource.InstanceType, inferenceObj.GPUCountRequirement) + workspaceObj.Resource.InstanceType, inferenceParam.GPUCountRequirement) if err != nil { return nil, fmt.Errorf("failed to get SKU num GPUs: %v", err) } + resourceReq := corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), + }, + Limits: corev1.ResourceList{ + corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), + }, + } - commands, resourceReq := prepareInferenceParameters(ctx, inferenceObj, skuNumGPUs) - image, imagePullSecrets := GetInferenceImageInfo(ctx, workspaceObj, inferenceObj) + // inference command + runtimeName := kaitov1alpha1.GetWorkspaceRuntimeName(workspaceObj) + commands := inferenceParam.GetInferenceCommand(runtimeName, skuNumGPUs) + + image, imagePullSecrets := GetInferenceImageInfo(ctx, workspaceObj, inferenceParam) var depObj client.Object - if supportDistributedInference { + if model.SupportDistributedInference() { depObj = manifests.GenerateStatefulSetManifest(ctx, workspaceObj, 
image, imagePullSecrets, *workspaceObj.Resource.Count, commands, containerPorts, livenessProbe, readinessProbe, resourceReq, tolerations, volumes, volumeMounts) } else { @@ -169,25 +188,3 @@ func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Work } return depObj, nil } - -// prepareInferenceParameters builds a PyTorch command: -// torchrun baseCommand -// and sets the GPU resources required for inference. -// Returns the command and resource configuration. -func prepareInferenceParameters(ctx context.Context, inferenceObj *model.PresetParam, skuNumGPUs string) ([]string, corev1.ResourceRequirements) { - torchCommand := utils.BuildCmdStr(inferenceObj.BaseCommand, inferenceObj.TorchRunParams) - torchCommand = utils.BuildCmdStr(torchCommand, inferenceObj.TorchRunRdzvParams) - modelCommand := utils.BuildCmdStr(InferenceFile, inferenceObj.ModelRunParams) - commands := utils.ShellCmd(torchCommand + " " + modelCommand) - - resourceRequirements := corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), - }, - Limits: corev1.ResourceList{ - corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), - }, - } - - return commands, resourceRequirements -} diff --git a/pkg/workspace/inference/preset-inferences_test.go b/pkg/workspace/inference/preset-inferences_test.go index bebbab4e7..de4cf1273 100644 --- a/pkg/workspace/inference/preset-inferences_test.go +++ b/pkg/workspace/inference/preset-inferences_test.go @@ -10,12 +10,10 @@ import ( "strings" "testing" - "github.com/kaito-project/kaito/pkg/utils/consts" - "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/kaito-project/kaito/pkg/utils/consts" "github.com/kaito-project/kaito/pkg/utils/test" - "github.com/kaito-project/kaito/pkg/model" "github.com/kaito-project/kaito/pkg/utils/plugin" "github.com/stretchr/testify/mock" appsv1 "k8s.io/api/apps/v1" @@ -28,6 +26,7 @@ var 
ValidStrength string = "0.5" func TestCreatePresetInference(t *testing.T) { test.RegisterTestModel() testcases := map[string]struct { + workspace *v1alpha1.Workspace nodeCount int modelName string callMocks func(c *test.MockClient) @@ -37,7 +36,35 @@ func TestCreatePresetInference(t *testing.T) { expectedVolume string }{ - "test-model": { + "test-model/vllm": { + workspace: test.MockWorkspaceWithPresetVLLM, + nodeCount: 1, + modelName: "test-model", + callMocks: func(c *test.MockClient) { + c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) + }, + workload: "Deployment", + // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams + // So expected cmd consists of shell command and inference file + expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py", + hasAdapters: false, + }, + + "test-model-with-adapters/vllm": { + workspace: test.MockWorkspaceWithPresetVLLM, + nodeCount: 1, + modelName: "test-model", + callMocks: func(c *test.MockClient) { + c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) + }, + workload: "Deployment", + expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py", + hasAdapters: true, + expectedVolume: "adapter-volume", + }, + + "test-model/transformers": { + workspace: test.MockWorkspaceWithPreset, nodeCount: 1, modelName: "test-model", callMocks: func(c *test.MockClient) { @@ -46,11 +73,12 @@ func TestCreatePresetInference(t *testing.T) { workload: "Deployment", // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams // So expected cmd consists of shell command and inference file - expectedCmd: "/bin/sh -c python3 inference_api.py", + expectedCmd: "/bin/sh -c accelerate launch /workspace/tfs/inference_api.py", hasAdapters: false, }, - "test-distributed-model": { + "test-distributed-model/transformers": { + workspace: test.MockWorkspaceDistributedModel, nodeCount: 1, modelName: 
"test-distributed-model", callMocks: func(c *test.MockClient) { @@ -58,18 +86,19 @@ func TestCreatePresetInference(t *testing.T) { c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) }, workload: "StatefulSet", - expectedCmd: "/bin/sh -c python3 inference_api.py", + expectedCmd: "/bin/sh -c accelerate launch --nnodes=1 --nproc_per_node=0 --max_restarts=3 --rdzv_id=job --rdzv_backend=c10d --rdzv_endpoint=testWorkspace-0.testWorkspace-headless.kaito.svc.cluster.local:29500 /workspace/tfs/inference_api.py", hasAdapters: false, }, "test-model-with-adapters": { + workspace: test.MockWorkspaceWithPreset, nodeCount: 1, modelName: "test-model", callMocks: func(c *test.MockClient) { c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) }, workload: "Deployment", - expectedCmd: "/bin/sh -c python3 inference_api.py", + expectedCmd: "/bin/sh -c accelerate launch /workspace/tfs/inference_api.py", hasAdapters: true, expectedVolume: "adapter-volume", }, @@ -81,7 +110,7 @@ func TestCreatePresetInference(t *testing.T) { mockClient := test.NewClient() tc.callMocks(mockClient) - workspace := test.MockWorkspaceWithPreset + workspace := tc.workspace workspace.Resource.Count = &tc.nodeCount expectedSecrets := []string{"fake-secret"} if tc.hasAdapters { @@ -97,15 +126,8 @@ func TestCreatePresetInference(t *testing.T) { } } - useHeadlessSvc := false - - var inferenceObj *model.PresetParam model := plugin.KaitoModelRegister.MustGet(tc.modelName) - inferenceObj = model.GetInferenceParameters() - if strings.Contains(tc.modelName, "distributed") { - useHeadlessSvc = true - } svc := &corev1.Service{ ObjectMeta: v1.ObjectMeta{ Name: workspace.Name, @@ -117,7 +139,7 @@ func TestCreatePresetInference(t *testing.T) { } mockClient.CreateOrUpdateObjectInMap(svc) - createdObject, _ := CreatePresetInference(context.TODO(), workspace, test.MockWorkspaceWithPresetHash, inferenceObj, useHeadlessSvc, 
mockClient) + createdObject, _ := CreatePresetInference(context.TODO(), workspace, test.MockWorkspaceWithPresetHash, model, mockClient) createdWorkload := "" switch createdObject.(type) { case *appsv1.Deployment: diff --git a/pkg/workspace/tuning/preset-tuning.go b/pkg/workspace/tuning/preset-tuning.go index 5703b1845..2707582aa 100644 --- a/pkg/workspace/tuning/preset-tuning.go +++ b/pkg/workspace/tuning/preset-tuning.go @@ -491,7 +491,7 @@ func handleURLDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Worksp } func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam) (string, error) { - modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.ModelRunParams) + modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.Transformers.ModelRunParams) return modelCommand, nil } @@ -501,14 +501,14 @@ func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam // Returns the command and resource configuration. func prepareTuningParameters(ctx context.Context, wObj *kaitov1alpha1.Workspace, modelCommand string, tuningObj *model.PresetParam, skuNumGPUs string) ([]string, corev1.ResourceRequirements) { - if tuningObj.TorchRunParams == nil { - tuningObj.TorchRunParams = make(map[string]string) + hfParam := tuningObj.Transformers // Only support Huggingface for now + if hfParam.TorchRunParams == nil { + hfParam.TorchRunParams = make(map[string]string) } // Set # of processes to GPU Count numProcesses := getInstanceGPUCount(wObj.Resource.InstanceType) - tuningObj.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) - torchCommand := utils.BuildCmdStr(tuningObj.BaseCommand, tuningObj.TorchRunParams) - torchCommand = utils.BuildCmdStr(torchCommand, tuningObj.TorchRunRdzvParams) + hfParam.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) + torchCommand := utils.BuildCmdStr(hfParam.BaseCommand, hfParam.TorchRunParams, hfParam.TorchRunRdzvParams) commands := utils.ShellCmd(torchCommand + " " + 
modelCommand) resourceRequirements := corev1.ResourceRequirements{ diff --git a/pkg/workspace/tuning/preset-tuning_test.go b/pkg/workspace/tuning/preset-tuning_test.go index 6e1ede930..1522595e6 100644 --- a/pkg/workspace/tuning/preset-tuning_test.go +++ b/pkg/workspace/tuning/preset-tuning_test.go @@ -416,9 +416,13 @@ func TestPrepareTuningParameters(t *testing.T) { }, modelCommand: "model-command", tuningObj: &model.PresetParam{ - BaseCommand: "python train.py", - TorchRunParams: map[string]string{}, - TorchRunRdzvParams: map[string]string{}, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "python train.py", + TorchRunParams: map[string]string{}, + TorchRunRdzvParams: map[string]string{}, + }, + }, GPUCountRequirement: "2", }, expectedCommands: []string{"/bin/sh", "-c", "python train.py --num_processes=1 model-command"}, diff --git a/presets/models/falcon/model.go b/presets/models/falcon/model.go index 88487fd63..73854a2f8 100644 --- a/presets/models/falcon/model.go +++ b/presets/models/falcon/model.go @@ -38,17 +38,22 @@ var ( PresetFalcon40BInstructModel = PresetFalcon40BModel + "-instruct" PresetFalconTagMap = map[string]string{ - "Falcon7B": "0.0.6", - "Falcon7BInstruct": "0.0.6", - "Falcon40B": "0.0.7", - "Falcon40BInstruct": "0.0.7", + "Falcon7B": "0.0.7", + "Falcon7BInstruct": "0.0.7", + "Falcon40B": "0.0.8", + "Falcon40BInstruct": "0.0.8", } baseCommandPresetFalconInference = "accelerate launch" - baseCommandPresetFalconTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetFalconTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" falconRunParams = map[string]string{ - "torch_dtype": "bfloat16", - "pipeline": "text-generation", + "torch_dtype": "bfloat16", + "pipeline": "text-generation", + "chat_template": "/workspace/chat_templates/falcon-instruct.jinja", + } + falconRunParamsVLLM = map[string]string{ + "dtype": "bfloat16", + "chat-template": 
"/workspace/chat_templates/falcon-instruct.jinja", } ) @@ -64,11 +69,20 @@ func (*falcon7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon7B"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "falcon-7b", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon7B"], } } func (*falcon7b) GetTuningParameters() *model.PresetParam { @@ -79,10 +93,14 @@ func (*falcon7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunPrams: falconRunTuningParams, // TODO + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconTuning, + TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunPrams: falconRunTuningParams, // TODO + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconTuning, Tag: PresetFalconTagMap["Falcon7B"], TuningPerGPUMemoryRequirement: map[string]int{"qlora": 16}, } @@ -107,11 +125,20 @@ func (*falcon7bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model 
parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon7BInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "falcon-7b-instruct", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon7BInstruct"], } } @@ -137,13 +164,21 @@ func (*falcon40b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon40B"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "falcon-40b", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon40B"], } - } func (*falcon40b) GetTuningParameters() *model.PresetParam { return &model.PresetParam{ @@ -153,10 +188,14 @@ func (*falcon40b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "16Gi", - TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunPrams: falconRunTuningParams, // TODO + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconTuning, + TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunPrams: falconRunTuningParams, // TODO + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconTuning, Tag: PresetFalconTagMap["Falcon40B"], } } @@ -179,11 +218,20 @@ func (*falcon40bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon40BInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "falcon-40b-instruct", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon40BInstruct"], } } func (*falcon40bInst) GetTuningParameters() *model.PresetParam { diff --git a/presets/models/llama2/model.go b/presets/models/llama2/model.go index 7a81d679f..fa741c855 100644 --- a/presets/models/llama2/model.go +++ b/presets/models/llama2/model.go @@ -32,6 +32,9 @@ var ( "max_seq_len": "512", "max_batch_size": "8", } + llamaRunParamsVLLM = map[string]string{ + "max-seq-len-to-capture": "512", + } ) var llama2A llama2Text7b @@ -46,15 +49,22 @@ func (*llama2Text7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "14Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. 
- TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(10) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 1, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(10) * time.Minute, + WorldSize: 1, // Tag: llama has private image access mode. The image tag is determined by the user. } - } func (*llama2Text7b) GetTuningParameters() *model.PresetParam { return nil // Currently doesn't support fine-tuning @@ -78,12 +88,20 @@ func (*llama2Text13b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "30Gi", PerGPUMemoryRequirement: "15Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(20) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 2, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(20) * time.Minute, + WorldSize: 2, // Tag: llama has private image access mode. The image tag is determined by the user. 
} } @@ -109,12 +127,20 @@ func (*llama2Text70b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "8", TotalGPUMemoryRequirement: "152Gi", PerGPUMemoryRequirement: "19Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 8, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + WorldSize: 8, // Tag: llama has private image access mode. The image tag is determined by the user. } } diff --git a/presets/models/llama2chat/model.go b/presets/models/llama2chat/model.go index 735ecc013..93d2b045d 100644 --- a/presets/models/llama2chat/model.go +++ b/presets/models/llama2chat/model.go @@ -32,6 +32,9 @@ var ( "max_seq_len": "512", "max_batch_size": "8", } + llamaRunParamsVLLM = map[string]string{ + "max-seq-len-to-capture": "512", + } ) var llama2chatA llama2Chat7b @@ -46,12 +49,20 @@ func (*llama2Chat7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "14Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. 
- TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(10) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 1, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(10) * time.Minute, + WorldSize: 1, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -77,12 +88,21 @@ func (*llama2Chat13b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "30Gi", PerGPUMemoryRequirement: "15Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(20) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 2, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(20) * time.Minute, + + WorldSize: 2, // Tag: llama has private image access mode. The image tag is determined by the user. 
} } @@ -108,12 +128,20 @@ func (*llama2Chat70b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "8", TotalGPUMemoryRequirement: "192Gi", PerGPUMemoryRequirement: "19Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 8, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + WorldSize: 8, // Tag: llama has private image access mode. The image tag is determined by the user. 
} } diff --git a/presets/models/mistral/model.go b/presets/models/mistral/model.go index 51a23b10c..76d929067 100644 --- a/presets/models/mistral/model.go +++ b/presets/models/mistral/model.go @@ -27,15 +27,20 @@ var ( PresetMistral7BInstructModel = PresetMistral7BModel + "-instruct" PresetMistralTagMap = map[string]string{ - "Mistral7B": "0.0.7", - "Mistral7BInstruct": "0.0.7", + "Mistral7B": "0.0.8", + "Mistral7BInstruct": "0.0.8", } baseCommandPresetMistralInference = "accelerate launch" - baseCommandPresetMistralTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetMistralTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" mistralRunParams = map[string]string{ - "torch_dtype": "bfloat16", - "pipeline": "text-generation", + "torch_dtype": "bfloat16", + "pipeline": "text-generation", + "chat_template": "/workspace/chat_templates/mistral-instruct.jinja", + } + mistralRunParamsVLLM = map[string]string{ + "dtype": "bfloat16", + "chat-template": "/workspace/chat_templates/mistral-instruct.jinja", } ) @@ -51,11 +56,20 @@ func (*mistral7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Mistral using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: mistralRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralInference, - Tag: PresetMistralTagMap["Mistral7B"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralInference, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "mistral-7b", + ModelRunParams: mistralRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetMistralTagMap["Mistral7B"], } } @@ -67,10 +81,14 @@ func (*mistral7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - //TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunParams: mistralRunParams, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + //TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralTuning, + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralTuning, Tag: PresetMistralTagMap["Mistral7B"], } } @@ -94,11 +112,20 @@ func (*mistral7bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "0Gi", // We run mistral using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: mistralRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralInference, - Tag: PresetMistralTagMap["Mistral7BInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralInference, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "mistral-7b-instruct", + ModelRunParams: mistralRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetMistralTagMap["Mistral7BInstruct"], } } diff --git a/presets/models/phi2/model.go b/presets/models/phi2/model.go index 31f495c7e..c708792e8 100644 --- a/presets/models/phi2/model.go +++ b/presets/models/phi2/model.go @@ -22,15 +22,18 @@ var ( PresetPhi2Model = "phi-2" PresetPhiTagMap = map[string]string{ - "Phi2": "0.0.5", + "Phi2": "0.0.6", } baseCommandPresetPhiInference = "accelerate launch" - baseCommandPresetPhiTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetPhiTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" phiRunParams = map[string]string{ "torch_dtype": "float16", "pipeline": "text-generation", } + phiRunParamsVLLM = map[string]string{ + "dtype": "float16", + } ) var phiA phi2 @@ -45,11 +48,20 @@ func (*phi2) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "12Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi2"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: phiRunParams, + BaseCommand: baseCommandPresetPhiInference, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "phi-2", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi2"], } } func (*phi2) GetTuningParameters() *model.PresetParam { @@ -60,10 +72,14 @@ func (*phi2) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - // TorchRunParams: inference.DefaultAccelerateParams, - // ModelRunParams: phiRunParams, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + // TorchRunParams: inference.DefaultAccelerateParams, + // ModelRunParams: phiRunParams, + BaseCommand: baseCommandPresetPhiTuning, + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, Tag: PresetPhiTagMap["Phi2"], } } diff --git a/presets/models/phi3/model.go b/presets/models/phi3/model.go index abf63003b..aa105eab5 100644 --- a/presets/models/phi3/model.go +++ b/presets/models/phi3/model.go @@ -42,20 +42,23 @@ var ( PresetPhi3_5MiniInstruct = "phi-3.5-mini-instruct" PresetPhiTagMap = map[string]string{ - "Phi3Mini4kInstruct": "0.0.2", - "Phi3Mini128kInstruct": "0.0.2", - "Phi3Medium4kInstruct": "0.0.2", - "Phi3Medium128kInstruct": "0.0.2", + "Phi3Mini4kInstruct": "0.0.3", + "Phi3Mini128kInstruct": "0.0.3", + "Phi3Medium4kInstruct": "0.0.3", + "Phi3Medium128kInstruct": "0.0.3", "Phi3_5MiniInstruct": "0.0.1", } baseCommandPresetPhiInference = "accelerate launch" - 
baseCommandPresetPhiTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetPhiTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" phiRunParams = map[string]string{ "torch_dtype": "auto", "pipeline": "text-generation", "trust_remote_code": "", } + phiRunParamsVLLM = map[string]string{ + "dtype": "auto", + } ) var phi3MiniA phi3Mini4KInst @@ -70,11 +73,20 @@ func (*phi3Mini4KInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "9Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "phi-3-mini-4k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], } } func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam { @@ -88,8 +100,12 @@ func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam { // TorchRunParams: inference.DefaultAccelerateParams, // ModelRunParams: phiRunParams, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], } } func (*phi3Mini4KInst) SupportDistributedInference() bool { return false } @@ -109,11 
+125,20 @@ func (*phi3Mini128KInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "9Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "phi-3-mini-128k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], } } func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam { @@ -124,11 +149,13 @@ func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "72Gi", PerGPUMemoryRequirement: "72Gi", - // TorchRunParams: inference.DefaultAccelerateParams, - // ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], + ReadinessTimeout: time.Duration(30) * time.Minute, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], } } func (*phi3Mini128KInst) SupportDistributedInference() bool { return false } @@ -148,11 +175,20 @@ func (*phi3_5MiniInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "8Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model 
parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "phi-3.5-mini-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], } } func (*phi3_5MiniInst) GetTuningParameters() *model.PresetParam { @@ -166,8 +202,12 @@ func (*phi3_5MiniInst) GetTuningParameters() *model.PresetParam { // TorchRunParams: inference.DefaultAccelerateParams, // ModelRunParams: phiRunParams, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], } } func (*phi3_5MiniInst) SupportDistributedInference() bool { return false } @@ -187,11 +227,20 @@ func (*Phi3Medium4kInstruct) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "28Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "phi-3-medium-4k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], } } func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam { @@ -205,8 +254,12 @@ func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam { // TorchRunParams: inference.DefaultAccelerateParams, // ModelRunParams: phiRunParams, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], } } func (*Phi3Medium4kInstruct) SupportDistributedInference() bool { return false } @@ -226,11 +279,20 @@ func (*Phi3Medium128kInstruct) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "28Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelName: "phi-3-medium-128k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], } } func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam { @@ -241,11 +303,13 @@ func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "80Gi", PerGPUMemoryRequirement: "80Gi", - // TorchRunParams: inference.DefaultAccelerateParams, - // ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], + ReadinessTimeout: time.Duration(30) * time.Minute, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], } } func (*Phi3Medium128kInstruct) SupportDistributedInference() bool { return false }