From 0a2e3a7bb8effdbca56405c6da6b0626ce3cbd17 Mon Sep 17 00:00:00 2001 From: jerryzhuang Date: Thu, 17 Oct 2024 17:42:13 +1100 Subject: [PATCH] feat: support vllm in controller - set vllm as the default runtime by a featureflag Signed-off-by: jerryzhuang --- api/v1alpha1/labels.go | 30 ++++ api/v1alpha1/workspace_validation.go | 4 +- pkg/featuregates/featuregates.go | 1 + pkg/model/interface.go | 144 ++++++++++++++-- pkg/utils/common-preset.go | 5 - pkg/utils/common.go | 14 +- pkg/utils/common_test.go | 2 +- pkg/utils/consts/consts.go | 5 +- pkg/utils/plugin/plugin.go | 4 + pkg/utils/test/testModel.go | 22 ++- pkg/utils/test/testUtils.go | 26 +++ .../controllers/workspace_controller.go | 2 +- .../inference/preset-inference-types.go | 3 + pkg/workspace/inference/preset-inferences.go | 87 +++++----- .../inference/preset-inferences_test.go | 56 ++++-- pkg/workspace/tuning/preset-tuning.go | 12 +- pkg/workspace/tuning/preset-tuning_test.go | 10 +- presets/workspace/models/falcon/model.go | 120 +++++++++---- presets/workspace/models/llama2/model.go | 52 +++--- presets/workspace/models/llama2chat/model.go | 52 ++++-- presets/workspace/models/mistral/model.go | 65 +++++-- presets/workspace/models/phi2/model.go | 37 ++-- presets/workspace/models/phi3/model.go | 160 +++++++++++++----- 23 files changed, 664 insertions(+), 249 deletions(-) diff --git a/api/v1alpha1/labels.go b/api/v1alpha1/labels.go index 7e807c29a..e66c8d427 100644 --- a/api/v1alpha1/labels.go +++ b/api/v1alpha1/labels.go @@ -3,6 +3,12 @@ package v1alpha1 +import ( + "github.com/kaito-project/kaito/pkg/featuregates" + "github.com/kaito-project/kaito/pkg/model" + "github.com/kaito-project/kaito/pkg/utils/consts" +) + const ( // Non-prefixed labels/annotations are reserved for end-use. @@ -30,4 +36,28 @@ const ( // RAGEngineRevisionAnnotation is the Annotations for revision number RAGEngineRevisionAnnotation = "ragengine.kaito.io/revision" + + // AnnotationWorkspaceRuntime is the annotation for runtime selection. + AnnotationWorkspaceRuntime = KAITOPrefix + "runtime" ) + +// GetWorkspaceRuntimeName returns the runtime name of the workspace. 
+func GetWorkspaceRuntimeName(ws *Workspace) model.RuntimeName { + if ws == nil { + panic("workspace is nil") + } + runtime := model.RuntimeNameHuggingfaceTransformers + if featuregates.FeatureGates[consts.FeatureFlagVLLM] { + runtime = model.RuntimeNameVLLM + } + + name := ws.Annotations[AnnotationWorkspaceRuntime] + switch name { + case string(model.RuntimeNameHuggingfaceTransformers): + runtime = model.RuntimeNameHuggingfaceTransformers + case string(model.RuntimeNameVLLM): + runtime = model.RuntimeNameVLLM + } + + return runtime +} diff --git a/api/v1alpha1/workspace_validation.go b/api/v1alpha1/workspace_validation.go index 9a4183c69..703322c26 100644 --- a/api/v1alpha1/workspace_validation.go +++ b/api/v1alpha1/workspace_validation.go @@ -169,7 +169,7 @@ func (r *TuningSpec) validateCreate(ctx context.Context, workspaceNamespace stri // Currently require a preset to specified, in future we can consider defining a template if r.Preset == nil { errs = errs.Also(apis.ErrMissingField("Preset")) - } else if presetName := string(r.Preset.Name); !utils.IsValidPreset(presetName) { + } else if presetName := string(r.Preset.Name); !plugin.IsValidPreset(presetName) { errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported tuning preset name %s", presetName), "presetName")) } return errs @@ -407,7 +407,7 @@ func (i *InferenceSpec) validateCreate() (errs *apis.FieldError) { if i.Preset != nil { presetName := string(i.Preset.Name) // Validate preset name - if !utils.IsValidPreset(presetName) { + if !plugin.IsValidPreset(presetName) { errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported inference preset name %s", presetName), "presetName")) } // Validate private preset has private image specified diff --git a/pkg/featuregates/featuregates.go b/pkg/featuregates/featuregates.go index 2161210e1..3fea1b2f5 100644 --- a/pkg/featuregates/featuregates.go +++ b/pkg/featuregates/featuregates.go @@ -15,6 +15,7 @@ var ( // FeatureGates is a map that holds the feature gates and their default values for Kaito. FeatureGates = map[string]bool{ consts.FeatureFlagKarpenter: false, + consts.FeatureFlagVLLM: false, // Add more feature gates here } ) diff --git a/pkg/model/interface.go b/pkg/model/interface.go index 56c925698..db172ac2f 100644 --- a/pkg/model/interface.go +++ b/pkg/model/interface.go @@ -4,6 +4,8 @@ package model import ( "time" + + "github.com/kaito-project/kaito/pkg/utils" ) type Model interface { @@ -13,23 +15,139 @@ type Model interface { SupportTuning() bool } +// RuntimeName is LLM runtime name. +type RuntimeName string + +const ( + RuntimeNameHuggingfaceTransformers RuntimeName = "transformers" + RuntimeNameVLLM RuntimeName = "vllm" +) + // PresetParam defines the preset inference parameters for a model. type PresetParam struct { - ModelFamilyName string // The name of the model family. - ImageAccessMode string // Defines where the Image is Public or Private. - DiskStorageRequirement string // Disk storage requirements for the model. - GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference. - TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference. - PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference. - TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning. - TorchRunParams map[string]string // Parameters for configuring the torchrun command. 
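For reference, the runtime-selection precedence introduced above (the vLLM feature gate picks the default, a recognized runtime annotation on the Workspace overrides it, and unrecognized values are ignored) reduces to the following self-contained sketch. It is an illustration, not the kaito packages themselves; the annotation key is assumed to be KAITOPrefix + "runtime", and the prefix constant is defined outside this diff, so the literal value used here may differ.

package main

import "fmt"

// RuntimeName mirrors model.RuntimeName from this patch.
type RuntimeName string

const (
	runtimeTransformers RuntimeName = "transformers"
	runtimeVLLM         RuntimeName = "vllm"

	// Assumed fully-qualified annotation key (KAITOPrefix + "runtime");
	// KAITOPrefix itself is not part of this diff.
	annotationRuntime = "kaito.sh/runtime"
)

// resolveRuntime mirrors GetWorkspaceRuntimeName: the vLLM feature gate sets
// the default, and a recognized annotation value overrides it.
func resolveRuntime(vllmGateEnabled bool, annotations map[string]string) RuntimeName {
	runtime := runtimeTransformers
	if vllmGateEnabled {
		runtime = runtimeVLLM
	}
	switch annotations[annotationRuntime] {
	case string(runtimeTransformers):
		runtime = runtimeTransformers
	case string(runtimeVLLM):
		runtime = runtimeVLLM
	}
	return runtime
}

func main() {
	fmt.Println(resolveRuntime(true, nil))                                                  // vllm
	fmt.Println(resolveRuntime(true, map[string]string{annotationRuntime: "transformers"})) // transformers
	fmt.Println(resolveRuntime(false, map[string]string{annotationRuntime: "bogus"}))       // transformers (unknown values are ignored)
}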
- TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic). - BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. - ModelRunParams map[string]string // Parameters for running the model training/inference. + Tag string // The model image tag + ModelFamilyName string // The name of the model family. + ImageAccessMode string // Defines where the Image is Public or Private. + + DiskStorageRequirement string // Disk storage requirements for the model. + GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference. + TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference. + PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference. + TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning. + WorldSize int // Defines the number of processes required for distributed inference. + + RuntimeParam + // ReadinessTimeout defines the maximum duration for creating the workload. // This timeout accommodates the size of the image, ensuring pull completion // even under slower network conditions or unforeseen delays. ReadinessTimeout time.Duration - WorldSize int // Defines the number of processes required for distributed inference. - Tag string // The model image tag +} + +// RuntimeParam defines the llm runtime parameters. +type RuntimeParam struct { + Transformers HuggingfaceTransformersParam + VLLM VLLMParam +} + +type HuggingfaceTransformersParam struct { + BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. + TorchRunParams map[string]string // Parameters for configuring the torchrun command. + TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic). + InferenceMainFile string // The main file for inference. + ModelRunParams map[string]string // Parameters for running the model training/inference. +} + +type VLLMParam struct { + BaseCommand string + // The model name used in the openai serving API. + // see https://platform.openai.com/docs/api-reference/chat/create#chat-create-model. + ModelName string + // Parameters for distributed inference. + DistributionParams map[string]string + // Parameters for running the model training/inference. 
+ ModelRunParams map[string]string +} + +func (p *PresetParam) DeepCopy() *PresetParam { + if p == nil { + return nil + } + out := new(PresetParam) + *out = *p + out.RuntimeParam = p.RuntimeParam.DeepCopy() + out.TuningPerGPUMemoryRequirement = make(map[string]int, len(p.TuningPerGPUMemoryRequirement)) + for k, v := range p.TuningPerGPUMemoryRequirement { + out.TuningPerGPUMemoryRequirement[k] = v + } + return out +} + +func (rp *RuntimeParam) DeepCopy() RuntimeParam { + if rp == nil { + return RuntimeParam{} + } + out := RuntimeParam{} + out.Transformers = rp.Transformers.DeepCopy() + out.VLLM = rp.VLLM.DeepCopy() + return out +} + +func (h *HuggingfaceTransformersParam) DeepCopy() HuggingfaceTransformersParam { + if h == nil { + return HuggingfaceTransformersParam{} + } + out := HuggingfaceTransformersParam{} + out.BaseCommand = h.BaseCommand + out.TorchRunParams = make(map[string]string, len(h.TorchRunParams)) + for k, v := range h.TorchRunParams { + out.TorchRunParams[k] = v + } + out.TorchRunRdzvParams = make(map[string]string, len(h.TorchRunRdzvParams)) + for k, v := range h.TorchRunRdzvParams { + out.TorchRunRdzvParams[k] = v + } + out.ModelRunParams = make(map[string]string, len(h.ModelRunParams)) + for k, v := range h.ModelRunParams { + out.ModelRunParams[k] = v + } + return out +} + +func (v *VLLMParam) DeepCopy() VLLMParam { + if v == nil { + return VLLMParam{} + } + out := VLLMParam{} + out.BaseCommand = v.BaseCommand + out.ModelName = v.ModelName + out.DistributionParams = make(map[string]string, len(v.DistributionParams)) + for k, v := range v.DistributionParams { + out.DistributionParams[k] = v + } + out.ModelRunParams = make(map[string]string, len(v.ModelRunParams)) + for k, v := range v.ModelRunParams { + out.ModelRunParams[k] = v + } + return out +} + +// builds the container command: +// eg. 
torchrun baseCommand +func (p *PresetParam) GetInferenceCommand(runtime RuntimeName, skuNumGPUs string) []string { + switch runtime { + case RuntimeNameHuggingfaceTransformers: + torchCommand := utils.BuildCmdStr(p.Transformers.BaseCommand, p.Transformers.TorchRunParams, p.Transformers.TorchRunRdzvParams) + modelCommand := utils.BuildCmdStr(p.Transformers.InferenceMainFile, p.Transformers.ModelRunParams) + return utils.ShellCmd(torchCommand + " " + modelCommand) + case RuntimeNameVLLM: + if p.VLLM.ModelName != "" { + p.VLLM.ModelRunParams["served-model-name"] = p.VLLM.ModelName + } + p.VLLM.ModelRunParams["tensor-parallel-size"] = skuNumGPUs + modelCommand := utils.BuildCmdStr(p.VLLM.BaseCommand, p.VLLM.ModelRunParams) + return utils.ShellCmd(modelCommand) + default: + return nil + } } diff --git a/pkg/utils/common-preset.go b/pkg/utils/common-preset.go index 3a43f6d0d..c9f5f8dd0 100644 --- a/pkg/utils/common-preset.go +++ b/pkg/utils/common-preset.go @@ -3,7 +3,6 @@ package utils import ( - "github.com/kaito-project/kaito/pkg/utils/plugin" corev1 "k8s.io/api/core/v1" ) @@ -150,7 +149,3 @@ func ConfigAdapterVolume() (corev1.Volume, corev1.VolumeMount) { } return volume, volumeMount } - -func IsValidPreset(preset string) bool { - return plugin.KaitoModelRegister.Has(preset) -} diff --git a/pkg/utils/common.go b/pkg/utils/common.go index af0c7e4cf..cda6e7d9a 100644 --- a/pkg/utils/common.go +++ b/pkg/utils/common.go @@ -68,13 +68,15 @@ func MergeConfigMaps(baseMap, overrideMap map[string]string) map[string]string { return merged } -func BuildCmdStr(baseCommand string, runParams map[string]string) string { +func BuildCmdStr(baseCommand string, runParams ...map[string]string) string { updatedBaseCommand := baseCommand - for key, value := range runParams { - if value == "" { - updatedBaseCommand = fmt.Sprintf("%s --%s", updatedBaseCommand, key) - } else { - updatedBaseCommand = fmt.Sprintf("%s --%s=%s", updatedBaseCommand, key, value) + for _, runParam := range runParams { + for key, value := range runParam { + if value == "" { + updatedBaseCommand = fmt.Sprintf("%s --%s", updatedBaseCommand, key) + } else { + updatedBaseCommand = fmt.Sprintf("%s --%s=%s", updatedBaseCommand, key, value) + } } } diff --git a/pkg/utils/common_test.go b/pkg/utils/common_test.go index e23997692..b214150e7 100644 --- a/pkg/utils/common_test.go +++ b/pkg/utils/common_test.go @@ -2,7 +2,6 @@ package utils import ( "context" - "sigs.k8s.io/controller-runtime/pkg/client" "testing" "github.com/stretchr/testify/assert" @@ -12,6 +11,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) diff --git a/pkg/utils/consts/consts.go b/pkg/utils/consts/consts.go index 6756302d5..bcb1b8c58 100644 --- a/pkg/utils/consts/consts.go +++ b/pkg/utils/consts/consts.go @@ -11,7 +11,6 @@ const ( // RAGEngineFinalizer is used to make sure that ragengine controller handles garbage collection. 
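As context for the command assembly above: BuildCmdStr now accepts any number of parameter maps and renders each entry as --key or --key=value, and the vLLM branch of GetInferenceCommand injects served-model-name and tensor-parallel-size before wrapping the result into a /bin/sh -c invocation via ShellCmd (as the updated test expectations later in this patch show). A minimal standalone sketch of that flow; this is a local re-implementation for illustration rather than the kaito utils package, and flag order follows Go map iteration, so it is not stable.

package main

import "fmt"

// buildCmdStr mirrors utils.BuildCmdStr from this patch: each map entry
// becomes "--key" (empty value) or "--key=value".
func buildCmdStr(baseCommand string, runParams ...map[string]string) string {
	cmd := baseCommand
	for _, params := range runParams {
		for key, value := range params {
			if value == "" {
				cmd = fmt.Sprintf("%s --%s", cmd, key)
			} else {
				cmd = fmt.Sprintf("%s --%s=%s", cmd, key, value)
			}
		}
	}
	return cmd
}

func main() {
	modelRunParams := map[string]string{
		"dtype": "bfloat16",
	}
	// What the vLLM branch of GetInferenceCommand adds before building the command.
	modelRunParams["served-model-name"] = "falcon-7b"
	modelRunParams["tensor-parallel-size"] = "2"

	cmd := buildCmdStr("python3 /workspace/vllm/inference_api.py", modelRunParams)

	// utils.ShellCmd wraps the command roughly like this.
	fmt.Println([]string{"/bin/sh", "-c", cmd})
	// e.g. [/bin/sh -c python3 /workspace/vllm/inference_api.py --dtype=bfloat16 --served-model-name=falcon-7b --tensor-parallel-size=2]
	// (flag order depends on map iteration)
}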
RAGEngineFinalizer = "ragengine.finalizer.kaito.sh" DefaultReleaseNamespaceEnvVar = "RELEASE_NAMESPACE" - FeatureFlagKarpenter = "Karpenter" AzureCloudName = "azure" AWSCloudName = "aws" GPUString = "gpu" @@ -20,6 +19,10 @@ const ( GiBToBytes = 1024 * 1024 * 1024 // Conversion factor from GiB to bytes NvidiaGPU = "nvidia.com/gpu" + // Feature flags + FeatureFlagKarpenter = "Karpenter" + FeatureFlagVLLM = "vLLM" + // Nodeclaim related consts KaitoNodePoolName = "kaito" LabelNodePool = "karpenter.sh/nodepool" diff --git a/pkg/utils/plugin/plugin.go b/pkg/utils/plugin/plugin.go index 35706cb9c..62f048265 100644 --- a/pkg/utils/plugin/plugin.go +++ b/pkg/utils/plugin/plugin.go @@ -60,3 +60,7 @@ func (reg *ModelRegister) Has(name string) bool { _, ok := reg.models[name] return ok } + +func IsValidPreset(preset string) bool { + return KaitoModelRegister.Has(preset) +} diff --git a/pkg/utils/test/testModel.go b/pkg/utils/test/testModel.go index 2a820191f..e616a5441 100644 --- a/pkg/utils/test/testModel.go +++ b/pkg/utils/test/testModel.go @@ -15,8 +15,15 @@ type testModel struct{} func (*testModel) GetInferenceParameters() *model.PresetParam { return &model.PresetParam{ GPUCountRequirement: "1", - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: "python3", + RuntimeParam: model.RuntimeParam{ + VLLM: model.VLLMParam{ + BaseCommand: "python3 /workspace/vllm/inference_api.py", + }, + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "accelerate launch", + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, } } func (*testModel) GetTuningParameters() *model.PresetParam { @@ -37,8 +44,15 @@ type testDistributedModel struct{} func (*testDistributedModel) GetInferenceParameters() *model.PresetParam { return &model.PresetParam{ GPUCountRequirement: "1", - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: "python3", + RuntimeParam: model.RuntimeParam{ + VLLM: model.VLLMParam{ + BaseCommand: "python3 /workspace/vllm/inference_api.py", + }, + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "accelerate launch", + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, } } func (*testDistributedModel) GetTuningParameters() *model.PresetParam { diff --git a/pkg/utils/test/testUtils.go b/pkg/utils/test/testUtils.go index 912cd2bc0..b0f728f6f 100644 --- a/pkg/utils/test/testUtils.go +++ b/pkg/utils/test/testUtils.go @@ -6,6 +6,7 @@ package test import ( "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/kaito-project/kaito/pkg/model" "github.com/samber/lo" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -139,6 +140,31 @@ var ( }, }, } + MockWorkspaceWithPresetVLLM = &v1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testWorkspace", + Namespace: "kaito", + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameVLLM), + }, + }, + Resource: v1alpha1.ResourceSpec{ + Count: &gpuNodeCount, + InstanceType: "Standard_NC12s_v3", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "apps": "test", + }, + }, + }, + Inference: &v1alpha1.InferenceSpec{ + Preset: &v1alpha1.PresetSpec{ + PresetMeta: v1alpha1.PresetMeta{ + Name: "test-model", + }, + }, + }, + } ) var MockWorkspaceWithPresetHash = "89ae127050ec264a5ce84db48ef7226574cdf1299e6bd27fe90b927e34cc8adb" diff --git a/pkg/workspace/controllers/workspace_controller.go b/pkg/workspace/controllers/workspace_controller.go index d5d9eee52..7762a582a 100644 
--- a/pkg/workspace/controllers/workspace_controller.go +++ b/pkg/workspace/controllers/workspace_controller.go @@ -780,7 +780,7 @@ func (c *WorkspaceReconciler) applyInference(ctx context.Context, wObj *kaitov1a } else if apierrors.IsNotFound(err) { var workloadObj client.Object // Need to create a new workload - workloadObj, err = inference.CreatePresetInference(ctx, wObj, revisionStr, inferenceParam, model.SupportDistributedInference(), c.Client) + workloadObj, err = inference.CreatePresetInference(ctx, wObj, revisionStr, model, c.Client) if err != nil { return } diff --git a/pkg/workspace/inference/preset-inference-types.go b/pkg/workspace/inference/preset-inference-types.go index d3157262e..205d0b4b3 100644 --- a/pkg/workspace/inference/preset-inference-types.go +++ b/pkg/workspace/inference/preset-inference-types.go @@ -54,5 +54,8 @@ var ( "gpu_ids": DefaultGPUIds, } + DefaultVLLMCommand = "python3 /workspace/vllm/inference_api.py" + DefautTransformersMainFile = "/workspace/tfs/inference_api.py" + DefaultImagePullSecrets = []corev1.LocalObjectReference{} ) diff --git a/pkg/workspace/inference/preset-inferences.go b/pkg/workspace/inference/preset-inferences.go index b36ba9b76..ae4983036 100644 --- a/pkg/workspace/inference/preset-inferences.go +++ b/pkg/workspace/inference/preset-inferences.go @@ -11,6 +11,7 @@ import ( "github.com/kaito-project/kaito/pkg/utils" "github.com/kaito-project/kaito/pkg/utils/consts" + "github.com/kaito-project/kaito/api/v1alpha1" kaitov1alpha1 "github.com/kaito-project/kaito/api/v1alpha1" "github.com/kaito-project/kaito/pkg/model" "github.com/kaito-project/kaito/pkg/utils/resources" @@ -23,9 +24,8 @@ import ( ) const ( - ProbePath = "/healthz" - Port5000 = 5000 - InferenceFile = "inference_api.py" + ProbePath = "/health" + Port5000 = 5000 ) var ( @@ -71,7 +71,12 @@ var ( } ) -func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient client.Client, wObj *kaitov1alpha1.Workspace, inferenceObj *model.PresetParam) error { +func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient client.Client, wObj *kaitov1alpha1.Workspace, inferenceParam *model.PresetParam) error { + runtimeName := v1alpha1.GetWorkspaceRuntimeName(wObj) + if runtimeName != model.RuntimeNameHuggingfaceTransformers { + return fmt.Errorf("distributed inference is not supported for runtime %s", runtimeName) + } + existingService := &corev1.Service{} err := resources.GetResource(ctx, wObj.Name, wObj.Namespace, kubeClient, existingService) if err != nil { @@ -79,18 +84,18 @@ func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient cl } nodes := *wObj.Resource.Count - inferenceObj.TorchRunParams["nnodes"] = strconv.Itoa(nodes) - inferenceObj.TorchRunParams["nproc_per_node"] = strconv.Itoa(inferenceObj.WorldSize / nodes) + inferenceParam.Transformers.TorchRunParams["nnodes"] = strconv.Itoa(nodes) + inferenceParam.Transformers.TorchRunParams["nproc_per_node"] = strconv.Itoa(inferenceParam.WorldSize / nodes) if nodes > 1 { - inferenceObj.TorchRunParams["node_rank"] = "$(echo $HOSTNAME | grep -o '[^-]*$')" - inferenceObj.TorchRunParams["master_addr"] = existingService.Spec.ClusterIP - inferenceObj.TorchRunParams["master_port"] = "29500" - } - if inferenceObj.TorchRunRdzvParams != nil { - inferenceObj.TorchRunRdzvParams["max_restarts"] = "3" - inferenceObj.TorchRunRdzvParams["rdzv_id"] = "job" - inferenceObj.TorchRunRdzvParams["rdzv_backend"] = "c10d" - inferenceObj.TorchRunRdzvParams["rdzv_endpoint"] = + 
inferenceParam.Transformers.TorchRunParams["node_rank"] = "$(echo $HOSTNAME | grep -o '[^-]*$')" + inferenceParam.Transformers.TorchRunParams["master_addr"] = existingService.Spec.ClusterIP + inferenceParam.Transformers.TorchRunParams["master_port"] = "29500" + } + if inferenceParam.Transformers.TorchRunRdzvParams != nil { + inferenceParam.Transformers.TorchRunRdzvParams["max_restarts"] = "3" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_id"] = "job" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_backend"] = "c10d" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_endpoint"] = fmt.Sprintf("%s-0.%s-headless.%s.svc.cluster.local:29500", wObj.Name, wObj.Name, wObj.Namespace) } return nil @@ -122,14 +127,17 @@ func GetInferenceImageInfo(ctx context.Context, workspaceObj *kaitov1alpha1.Work } func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, revisionNum string, - inferenceObj *model.PresetParam, supportDistributedInference bool, kubeClient client.Client) (client.Object, error) { - if inferenceObj.TorchRunParams != nil && supportDistributedInference { - if err := updateTorchParamsForDistributedInference(ctx, kubeClient, workspaceObj, inferenceObj); err != nil { + model model.Model, kubeClient client.Client) (client.Object, error) { + inferenceParam := model.GetInferenceParameters().DeepCopy() + + if model.SupportDistributedInference() { + if err := updateTorchParamsForDistributedInference(ctx, kubeClient, workspaceObj, inferenceParam); err != nil { // klog.ErrorS(err, "failed to update torch params", "workspace", workspaceObj) return nil, err } } + // additional volume var volumes []corev1.Volume var volumeMounts []corev1.VolumeMount shmVolume, shmVolumeMount := utils.ConfigSHMVolume(*workspaceObj.Resource.Count) @@ -139,24 +147,35 @@ func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Work if shmVolumeMount.Name != "" { volumeMounts = append(volumeMounts, shmVolumeMount) } - if len(workspaceObj.Inference.Adapters) > 0 { adapterVolume, adapterVolumeMount := utils.ConfigAdapterVolume() volumes = append(volumes, adapterVolume) volumeMounts = append(volumeMounts, adapterVolumeMount) } + // resource requirements skuNumGPUs, err := utils.GetSKUNumGPUs(ctx, kubeClient, workspaceObj.Status.WorkerNodes, - workspaceObj.Resource.InstanceType, inferenceObj.GPUCountRequirement) + workspaceObj.Resource.InstanceType, inferenceParam.GPUCountRequirement) if err != nil { return nil, fmt.Errorf("failed to get SKU num GPUs: %v", err) } + resourceReq := corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), + }, + Limits: corev1.ResourceList{ + corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), + }, + } - commands, resourceReq := prepareInferenceParameters(ctx, inferenceObj, skuNumGPUs) - image, imagePullSecrets := GetInferenceImageInfo(ctx, workspaceObj, inferenceObj) + // inference command + runtimeName := kaitov1alpha1.GetWorkspaceRuntimeName(workspaceObj) + commands := inferenceParam.GetInferenceCommand(runtimeName, skuNumGPUs) + + image, imagePullSecrets := GetInferenceImageInfo(ctx, workspaceObj, inferenceParam) var depObj client.Object - if supportDistributedInference { + if model.SupportDistributedInference() { depObj = manifests.GenerateStatefulSetManifest(ctx, workspaceObj, image, imagePullSecrets, *workspaceObj.Resource.Count, commands, containerPorts, livenessProbe, readinessProbe, resourceReq, 
tolerations, volumes, volumeMounts) } else { @@ -169,25 +188,3 @@ func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Work } return depObj, nil } - -// prepareInferenceParameters builds a PyTorch command: -// torchrun baseCommand -// and sets the GPU resources required for inference. -// Returns the command and resource configuration. -func prepareInferenceParameters(ctx context.Context, inferenceObj *model.PresetParam, skuNumGPUs string) ([]string, corev1.ResourceRequirements) { - torchCommand := utils.BuildCmdStr(inferenceObj.BaseCommand, inferenceObj.TorchRunParams) - torchCommand = utils.BuildCmdStr(torchCommand, inferenceObj.TorchRunRdzvParams) - modelCommand := utils.BuildCmdStr(InferenceFile, inferenceObj.ModelRunParams) - commands := utils.ShellCmd(torchCommand + " " + modelCommand) - - resourceRequirements := corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), - }, - Limits: corev1.ResourceList{ - corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), - }, - } - - return commands, resourceRequirements -} diff --git a/pkg/workspace/inference/preset-inferences_test.go b/pkg/workspace/inference/preset-inferences_test.go index bebbab4e7..abd3f5385 100644 --- a/pkg/workspace/inference/preset-inferences_test.go +++ b/pkg/workspace/inference/preset-inferences_test.go @@ -10,12 +10,10 @@ import ( "strings" "testing" - "github.com/kaito-project/kaito/pkg/utils/consts" - "github.com/kaito-project/kaito/api/v1alpha1" + "github.com/kaito-project/kaito/pkg/utils/consts" "github.com/kaito-project/kaito/pkg/utils/test" - "github.com/kaito-project/kaito/pkg/model" "github.com/kaito-project/kaito/pkg/utils/plugin" "github.com/stretchr/testify/mock" appsv1 "k8s.io/api/apps/v1" @@ -28,6 +26,7 @@ var ValidStrength string = "0.5" func TestCreatePresetInference(t *testing.T) { test.RegisterTestModel() testcases := map[string]struct { + workspace *v1alpha1.Workspace nodeCount int modelName string callMocks func(c *test.MockClient) @@ -37,7 +36,35 @@ func TestCreatePresetInference(t *testing.T) { expectedVolume string }{ - "test-model": { + "test-model/vllm": { + workspace: test.MockWorkspaceWithPresetVLLM, + nodeCount: 1, + modelName: "test-model", + callMocks: func(c *test.MockClient) { + c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) + }, + workload: "Deployment", + // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams + // So expected cmd consists of shell command and inference file + expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2", + hasAdapters: false, + }, + + "test-model-with-adapters/vllm": { + workspace: test.MockWorkspaceWithPresetVLLM, + nodeCount: 1, + modelName: "test-model", + callMocks: func(c *test.MockClient) { + c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) + }, + workload: "Deployment", + expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py --tensor-parallel-size=2", + hasAdapters: true, + expectedVolume: "adapter-volume", + }, + + "test-model/transformers": { + workspace: test.MockWorkspaceWithPreset, nodeCount: 1, modelName: "test-model", callMocks: func(c *test.MockClient) { @@ -46,11 +73,12 @@ func TestCreatePresetInference(t *testing.T) { workload: "Deployment", // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams // So 
expected cmd consists of shell command and inference file - expectedCmd: "/bin/sh -c python3 inference_api.py", + expectedCmd: "/bin/sh -c accelerate launch /workspace/tfs/inference_api.py", hasAdapters: false, }, - "test-distributed-model": { + "test-distributed-model/transformers": { + workspace: test.MockWorkspaceDistributedModel, nodeCount: 1, modelName: "test-distributed-model", callMocks: func(c *test.MockClient) { @@ -58,18 +86,19 @@ func TestCreatePresetInference(t *testing.T) { c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) }, workload: "StatefulSet", - expectedCmd: "/bin/sh -c python3 inference_api.py", + expectedCmd: "/bin/sh -c accelerate launch --nnodes=1 --nproc_per_node=0 --max_restarts=3 --rdzv_id=job --rdzv_backend=c10d --rdzv_endpoint=testWorkspace-0.testWorkspace-headless.kaito.svc.cluster.local:29500 /workspace/tfs/inference_api.py", hasAdapters: false, }, "test-model-with-adapters": { + workspace: test.MockWorkspaceWithPreset, nodeCount: 1, modelName: "test-model", callMocks: func(c *test.MockClient) { c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) }, workload: "Deployment", - expectedCmd: "/bin/sh -c python3 inference_api.py", + expectedCmd: "/bin/sh -c accelerate launch /workspace/tfs/inference_api.py", hasAdapters: true, expectedVolume: "adapter-volume", }, @@ -81,7 +110,7 @@ func TestCreatePresetInference(t *testing.T) { mockClient := test.NewClient() tc.callMocks(mockClient) - workspace := test.MockWorkspaceWithPreset + workspace := tc.workspace workspace.Resource.Count = &tc.nodeCount expectedSecrets := []string{"fake-secret"} if tc.hasAdapters { @@ -97,15 +126,8 @@ func TestCreatePresetInference(t *testing.T) { } } - useHeadlessSvc := false - - var inferenceObj *model.PresetParam model := plugin.KaitoModelRegister.MustGet(tc.modelName) - inferenceObj = model.GetInferenceParameters() - if strings.Contains(tc.modelName, "distributed") { - useHeadlessSvc = true - } svc := &corev1.Service{ ObjectMeta: v1.ObjectMeta{ Name: workspace.Name, @@ -117,7 +139,7 @@ func TestCreatePresetInference(t *testing.T) { } mockClient.CreateOrUpdateObjectInMap(svc) - createdObject, _ := CreatePresetInference(context.TODO(), workspace, test.MockWorkspaceWithPresetHash, inferenceObj, useHeadlessSvc, mockClient) + createdObject, _ := CreatePresetInference(context.TODO(), workspace, test.MockWorkspaceWithPresetHash, model, mockClient) createdWorkload := "" switch createdObject.(type) { case *appsv1.Deployment: diff --git a/pkg/workspace/tuning/preset-tuning.go b/pkg/workspace/tuning/preset-tuning.go index 5703b1845..2707582aa 100644 --- a/pkg/workspace/tuning/preset-tuning.go +++ b/pkg/workspace/tuning/preset-tuning.go @@ -491,7 +491,7 @@ func handleURLDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Worksp } func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam) (string, error) { - modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.ModelRunParams) + modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.Transformers.ModelRunParams) return modelCommand, nil } @@ -501,14 +501,14 @@ func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam // Returns the command and resource configuration. 
func prepareTuningParameters(ctx context.Context, wObj *kaitov1alpha1.Workspace, modelCommand string, tuningObj *model.PresetParam, skuNumGPUs string) ([]string, corev1.ResourceRequirements) { - if tuningObj.TorchRunParams == nil { - tuningObj.TorchRunParams = make(map[string]string) + hfParam := tuningObj.Transformers // Only support Huggingface for now + if hfParam.TorchRunParams == nil { + hfParam.TorchRunParams = make(map[string]string) } // Set # of processes to GPU Count numProcesses := getInstanceGPUCount(wObj.Resource.InstanceType) - tuningObj.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) - torchCommand := utils.BuildCmdStr(tuningObj.BaseCommand, tuningObj.TorchRunParams) - torchCommand = utils.BuildCmdStr(torchCommand, tuningObj.TorchRunRdzvParams) + hfParam.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) + torchCommand := utils.BuildCmdStr(hfParam.BaseCommand, hfParam.TorchRunParams, hfParam.TorchRunRdzvParams) commands := utils.ShellCmd(torchCommand + " " + modelCommand) resourceRequirements := corev1.ResourceRequirements{ diff --git a/pkg/workspace/tuning/preset-tuning_test.go b/pkg/workspace/tuning/preset-tuning_test.go index 6e1ede930..1522595e6 100644 --- a/pkg/workspace/tuning/preset-tuning_test.go +++ b/pkg/workspace/tuning/preset-tuning_test.go @@ -416,9 +416,13 @@ func TestPrepareTuningParameters(t *testing.T) { }, modelCommand: "model-command", tuningObj: &model.PresetParam{ - BaseCommand: "python train.py", - TorchRunParams: map[string]string{}, - TorchRunRdzvParams: map[string]string{}, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "python train.py", + TorchRunParams: map[string]string{}, + TorchRunRdzvParams: map[string]string{}, + }, + }, GPUCountRequirement: "2", }, expectedCommands: []string{"/bin/sh", "-c", "python train.py --num_processes=1 model-command"}, diff --git a/presets/workspace/models/falcon/model.go b/presets/workspace/models/falcon/model.go index 88487fd63..d78a22b9d 100644 --- a/presets/workspace/models/falcon/model.go +++ b/presets/workspace/models/falcon/model.go @@ -38,17 +38,22 @@ var ( PresetFalcon40BInstructModel = PresetFalcon40BModel + "-instruct" PresetFalconTagMap = map[string]string{ - "Falcon7B": "0.0.6", - "Falcon7BInstruct": "0.0.6", - "Falcon40B": "0.0.7", - "Falcon40BInstruct": "0.0.7", + "Falcon7B": "0.0.7", + "Falcon7BInstruct": "0.0.7", + "Falcon40B": "0.0.8", + "Falcon40BInstruct": "0.0.8", } baseCommandPresetFalconInference = "accelerate launch" - baseCommandPresetFalconTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetFalconTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" falconRunParams = map[string]string{ - "torch_dtype": "bfloat16", - "pipeline": "text-generation", + "torch_dtype": "bfloat16", + "pipeline": "text-generation", + "chat_template": "/workspace/chat_templates/falcon-instruct.jinja", + } + falconRunParamsVLLM = map[string]string{ + "dtype": "bfloat16", + "chat-template": "/workspace/chat_templates/falcon-instruct.jinja", } ) @@ -64,11 +69,21 @@ func (*falcon7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon7B"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "falcon-7b", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon7B"], } } func (*falcon7b) GetTuningParameters() *model.PresetParam { @@ -79,10 +94,14 @@ func (*falcon7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunPrams: falconRunTuningParams, // TODO + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconTuning, + TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunPrams: falconRunTuningParams, // TODO + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconTuning, Tag: PresetFalconTagMap["Falcon7B"], TuningPerGPUMemoryRequirement: map[string]int{"qlora": 16}, } @@ -107,11 +126,21 @@ func (*falcon7bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon7BInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "falcon-7b-instruct", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon7BInstruct"], } } @@ -137,13 +166,22 @@ func (*falcon40b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon40B"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "falcon-40b", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon40B"], } - } func (*falcon40b) GetTuningParameters() *model.PresetParam { return &model.PresetParam{ @@ -153,10 +191,14 @@ func (*falcon40b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "16Gi", - TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunPrams: falconRunTuningParams, // TODO + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconTuning, + TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunPrams: falconRunTuningParams, // TODO + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconTuning, Tag: PresetFalconTagMap["Falcon40B"], } } @@ -179,11 +221,21 @@ func (*falcon40bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon40BInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "falcon-40b-instruct", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon40BInstruct"], } } func (*falcon40bInst) GetTuningParameters() *model.PresetParam { diff --git a/presets/workspace/models/llama2/model.go b/presets/workspace/models/llama2/model.go index 7a81d679f..a9fb5a247 100644 --- a/presets/workspace/models/llama2/model.go +++ b/presets/workspace/models/llama2/model.go @@ -46,15 +46,19 @@ func (*llama2Text7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "14Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. 
- TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(10) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 1, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(10) * time.Minute, + WorldSize: 1, // Tag: llama has private image access mode. The image tag is determined by the user. } - } func (*llama2Text7b) GetTuningParameters() *model.PresetParam { return nil // Currently doesn't support fine-tuning @@ -78,12 +82,17 @@ func (*llama2Text13b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "30Gi", PerGPUMemoryRequirement: "15Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(20) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 2, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(20) * time.Minute, + WorldSize: 2, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -109,12 +118,17 @@ func (*llama2Text70b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "8", TotalGPUMemoryRequirement: "152Gi", PerGPUMemoryRequirement: "19Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 8, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + WorldSize: 8, // Tag: llama has private image access mode. The image tag is determined by the user. } } diff --git a/presets/workspace/models/llama2chat/model.go b/presets/workspace/models/llama2chat/model.go index 735ecc013..431d04b0c 100644 --- a/presets/workspace/models/llama2chat/model.go +++ b/presets/workspace/models/llama2chat/model.go @@ -46,12 +46,17 @@ func (*llama2Chat7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "14Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. 
- TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(10) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 1, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(10) * time.Minute, + WorldSize: 1, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -77,12 +82,18 @@ func (*llama2Chat13b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "30Gi", PerGPUMemoryRequirement: "15Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(20) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 2, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(20) * time.Minute, + + WorldSize: 2, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -108,12 +119,17 @@ func (*llama2Chat70b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "8", TotalGPUMemoryRequirement: "192Gi", PerGPUMemoryRequirement: "19Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 8, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + InferenceMainFile: "inference_api.py", + ModelRunParams: llamaRunParams, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + WorldSize: 8, // Tag: llama has private image access mode. The image tag is determined by the user. 
} } diff --git a/presets/workspace/models/mistral/model.go b/presets/workspace/models/mistral/model.go index 51a23b10c..54cc4ecde 100644 --- a/presets/workspace/models/mistral/model.go +++ b/presets/workspace/models/mistral/model.go @@ -27,15 +27,20 @@ var ( PresetMistral7BInstructModel = PresetMistral7BModel + "-instruct" PresetMistralTagMap = map[string]string{ - "Mistral7B": "0.0.7", - "Mistral7BInstruct": "0.0.7", + "Mistral7B": "0.0.8", + "Mistral7BInstruct": "0.0.8", } baseCommandPresetMistralInference = "accelerate launch" - baseCommandPresetMistralTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetMistralTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" mistralRunParams = map[string]string{ - "torch_dtype": "bfloat16", - "pipeline": "text-generation", + "torch_dtype": "bfloat16", + "pipeline": "text-generation", + "chat_template": "/workspace/chat_templates/mistral-instruct.jinja", + } + mistralRunParamsVLLM = map[string]string{ + "dtype": "bfloat16", + "chat-template": "/workspace/chat_templates/mistral-instruct.jinja", } ) @@ -51,11 +56,21 @@ func (*mistral7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Mistral using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: mistralRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralInference, - Tag: PresetMistralTagMap["Mistral7B"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralInference, + InferenceMainFile: inference.DefautTransformersMainFile, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "mistral-7b", + ModelRunParams: mistralRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetMistralTagMap["Mistral7B"], } } @@ -67,10 +82,14 @@ func (*mistral7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - //TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunParams: mistralRunParams, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + //TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunParams: mistralRunParams, + BaseCommand: inference.DefaultVLLMCommand, + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralTuning, Tag: PresetMistralTagMap["Mistral7B"], } } @@ -94,11 +113,21 @@ func (*mistral7bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "0Gi", // We run mistral using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: mistralRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralInference, - Tag: PresetMistralTagMap["Mistral7BInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralInference, + InferenceMainFile: inference.DefautTransformersMainFile, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "mistral-7b-instruct", + ModelRunParams: mistralRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetMistralTagMap["Mistral7BInstruct"], } } diff --git a/presets/workspace/models/phi2/model.go b/presets/workspace/models/phi2/model.go index 31f495c7e..eeb305fe5 100644 --- a/presets/workspace/models/phi2/model.go +++ b/presets/workspace/models/phi2/model.go @@ -22,15 +22,18 @@ var ( PresetPhi2Model = "phi-2" PresetPhiTagMap = map[string]string{ - "Phi2": "0.0.5", + "Phi2": "0.0.6", } baseCommandPresetPhiInference = "accelerate launch" - baseCommandPresetPhiTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetPhiTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" phiRunParams = map[string]string{ "torch_dtype": "float16", "pipeline": "text-generation", } + phiRunParamsVLLM = map[string]string{ + "dtype": "float16", + } ) var phiA phi2 @@ -45,11 +48,21 @@ func (*phi2) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "12Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi2"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: phiRunParams, + BaseCommand: baseCommandPresetPhiInference, + InferenceMainFile: inference.DefautTransformersMainFile, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-2", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi2"], } } func (*phi2) GetTuningParameters() *model.PresetParam { @@ -60,10 +73,14 @@ func (*phi2) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - // TorchRunParams: inference.DefaultAccelerateParams, - // ModelRunParams: phiRunParams, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + // TorchRunParams: inference.DefaultAccelerateParams, + // ModelRunParams: phiRunParams, + BaseCommand: baseCommandPresetPhiTuning, + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, Tag: PresetPhiTagMap["Phi2"], } } diff --git a/presets/workspace/models/phi3/model.go b/presets/workspace/models/phi3/model.go index abf63003b..34863c95d 100644 --- a/presets/workspace/models/phi3/model.go +++ b/presets/workspace/models/phi3/model.go @@ -42,20 +42,23 @@ var ( PresetPhi3_5MiniInstruct = "phi-3.5-mini-instruct" PresetPhiTagMap = map[string]string{ - "Phi3Mini4kInstruct": "0.0.2", - "Phi3Mini128kInstruct": "0.0.2", - "Phi3Medium4kInstruct": "0.0.2", - "Phi3Medium128kInstruct": "0.0.2", + "Phi3Mini4kInstruct": "0.0.3", + "Phi3Mini128kInstruct": "0.0.3", + "Phi3Medium4kInstruct": "0.0.3", + "Phi3Medium128kInstruct": "0.0.3", "Phi3_5MiniInstruct": "0.0.1", } baseCommandPresetPhiInference = "accelerate launch" - baseCommandPresetPhiTuning = "python3 metrics_server.py & accelerate launch" + baseCommandPresetPhiTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch" phiRunParams = map[string]string{ "torch_dtype": "auto", "pipeline": "text-generation", "trust_remote_code": "", } + phiRunParamsVLLM = map[string]string{ + "dtype": "auto", + } ) var phi3MiniA phi3Mini4KInst @@ -70,11 +73,21 @@ func (*phi3Mini4KInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "9Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-3-mini-4k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], } } func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam { @@ -88,8 +101,12 @@ func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam { // TorchRunParams: inference.DefaultAccelerateParams, // ModelRunParams: phiRunParams, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Mini4kInstruct"], } } func (*phi3Mini4KInst) SupportDistributedInference() bool { return false } @@ -109,11 +126,21 @@ func (*phi3Mini128KInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "9Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-3-mini-128k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], } } func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam { @@ -124,11 +151,13 @@ func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "72Gi", PerGPUMemoryRequirement: "72Gi", - // TorchRunParams: inference.DefaultAccelerateParams, - // ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], + ReadinessTimeout: time.Duration(30) * time.Minute, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Mini128kInstruct"], } } func (*phi3Mini128KInst) SupportDistributedInference() bool { return false } @@ -148,11 +177,21 @@ func (*phi3_5MiniInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "8Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-3.5-mini-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], } } func (*phi3_5MiniInst) GetTuningParameters() *model.PresetParam { @@ -166,8 +205,12 @@ func (*phi3_5MiniInst) GetTuningParameters() *model.PresetParam { // TorchRunParams: inference.DefaultAccelerateParams, // ModelRunParams: phiRunParams, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3_5MiniInstruct"], } } func (*phi3_5MiniInst) SupportDistributedInference() bool { return false } @@ -187,11 +230,21 @@ func (*Phi3Medium4kInstruct) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "28Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + InferenceMainFile: inference.DefautTransformersMainFile, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-3-medium-4k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], } } func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam { @@ -205,8 +258,12 @@ func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam { // TorchRunParams: inference.DefaultAccelerateParams, // ModelRunParams: phiRunParams, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Medium4kInstruct"], } } func (*Phi3Medium4kInstruct) SupportDistributedInference() bool { return false } @@ -226,11 +283,20 @@ func (*Phi3Medium128kInstruct) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "28Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiInference, - Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: phiRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: inference.DefaultVLLMCommand, + ModelName: "phi-3-medium-128k-instruct", + ModelRunParams: phiRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], } } func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam { @@ -241,11 +307,13 @@ func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "80Gi", PerGPUMemoryRequirement: "80Gi", - // TorchRunParams: inference.DefaultAccelerateParams, - // ModelRunParams: phiRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetPhiTuning, - Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], + ReadinessTimeout: time.Duration(30) * time.Minute, + RuntimeParam: model.RuntimeParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetPhiTuning, + }, + }, + Tag: PresetPhiTagMap["Phi3Medium128kInstruct"], } } func (*Phi3Medium128kInstruct) SupportDistributedInference() bool { return false }
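Taken together, a workspace opts into a runtime purely through metadata: the controller resolves the runtime from the workspace annotation (falling back to the vLLM feature-gate default) and derives the serving command, GPU resource requests, and Deployment vs. StatefulSet choice inside CreatePresetInference. A minimal opt-in sketch, modeled on MockWorkspaceWithPresetVLLM from this patch; the preset name is illustrative, and the remaining spec fields and error handling are omitted.

package main

import (
	"fmt"

	"github.com/kaito-project/kaito/api/v1alpha1"
	"github.com/kaito-project/kaito/pkg/model"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	ws := &v1alpha1.Workspace{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "demo",
			Namespace: "kaito",
			Annotations: map[string]string{
				// Pin this workspace to vLLM regardless of the feature-gate default.
				v1alpha1.AnnotationWorkspaceRuntime: string(model.RuntimeNameVLLM),
			},
		},
		Inference: &v1alpha1.InferenceSpec{
			Preset: &v1alpha1.PresetSpec{
				PresetMeta: v1alpha1.PresetMeta{Name: "falcon-7b-instruct"},
			},
		},
	}

	// The controller resolves the runtime from the annotation (or the vLLM
	// feature gate when the annotation is absent) before building the command.
	fmt.Println(v1alpha1.GetWorkspaceRuntimeName(ws)) // vllm
}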