diff --git a/Makefile b/Makefile index 08ca8fcb0..4e254c1a8 100644 --- a/Makefile +++ b/Makefile @@ -319,11 +319,11 @@ azure-karpenter-helm: ## Update Azure client env vars and settings in helm valu ##@ Build .PHONY: build build: manifests generate fmt vet ## Build manager binary. - go build -o bin/manager cmd/*.go + go build -o bin/manager cmd/workspace/*.go .PHONY: run run: manifests generate fmt vet ## Run a controller from your host. - go run ./cmd/main.go + go run ./cmd/workspace/main.go ##@ Build Dependencies ## Location to install dependencies to diff --git a/api/v1alpha1/labels.go b/api/v1alpha1/labels.go index 409c08ef8..65ef0ca09 100644 --- a/api/v1alpha1/labels.go +++ b/api/v1alpha1/labels.go @@ -3,6 +3,8 @@ package v1alpha1 +import "github.com/azure/kaito/pkg/model" + const ( // Non-prefixed labels/annotations are reserved for end-use. @@ -27,4 +29,25 @@ const ( // WorkspaceRevisionAnnotation is the Annotations for revision number WorkspaceRevisionAnnotation = "workspace.kaito.io/revision" + + // AnnotationWorkspaceBackend is the annotation for backend selection. + AnnotationWorkspaceBackend = KAITOPrefix + "backend" ) + +// GetWorkspaceBackendName returns the runtime name of the workspace. +func GetWorkspaceBackendName(ws *Workspace) model.BackendName { + if ws == nil { + panic("workspace is nil") + } + runtime := model.BackendNameVLLM + + name := ws.Annotations[AnnotationWorkspaceBackend] + switch name { + case string(model.BackendNameHuggingfaceTransformers): + runtime = model.BackendNameHuggingfaceTransformers + case string(model.BackendNameVLLM): + runtime = model.BackendNameVLLM + } + + return runtime +} diff --git a/api/v1alpha1/workspace_validation.go b/api/v1alpha1/workspace_validation.go index 07e45c9b7..3493b3566 100644 --- a/api/v1alpha1/workspace_validation.go +++ b/api/v1alpha1/workspace_validation.go @@ -169,7 +169,7 @@ func (r *TuningSpec) validateCreate(ctx context.Context, workspaceNamespace stri // Currently require a preset to specified, in future we can consider defining a template if r.Preset == nil { errs = errs.Also(apis.ErrMissingField("Preset")) - } else if presetName := string(r.Preset.Name); !utils.IsValidPreset(presetName) { + } else if presetName := string(r.Preset.Name); !plugin.IsValidPreset(presetName) { errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported tuning preset name %s", presetName), "presetName")) } return errs @@ -407,7 +407,7 @@ func (i *InferenceSpec) validateCreate() (errs *apis.FieldError) { if i.Preset != nil { presetName := string(i.Preset.Name) // Validate preset name - if !utils.IsValidPreset(presetName) { + if !plugin.IsValidPreset(presetName) { errs = errs.Also(apis.ErrInvalidValue(fmt.Sprintf("Unsupported inference preset name %s", presetName), "presetName")) } // Validate private preset has private image specified diff --git a/docker/presets/models/tfs/Dockerfile b/docker/presets/models/tfs/Dockerfile index 61c263b7f..1b9d4cdba 100644 --- a/docker/presets/models/tfs/Dockerfile +++ b/docker/presets/models/tfs/Dockerfile @@ -5,31 +5,39 @@ ARG MODEL_TYPE ARG VERSION # Set the working directory -WORKDIR /workspace/tfs +WORKDIR /workspace -# Write the version to a file -RUN echo $VERSION > /workspace/tfs/version.txt - -# First, copy just the preset files and install dependencies -# This is done before copying the code to utilize Docker's layer caching and -# avoid reinstalling dependencies unless the requirements file changes. 
-# Inference COPY kaito/presets/inference/${MODEL_TYPE}/requirements.txt /workspace/tfs/inference-requirements.txt -RUN pip install --no-cache-dir -r inference-requirements.txt - -COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py - -# Fine Tuning COPY kaito/presets/tuning/${MODEL_TYPE}/requirements.txt /workspace/tfs/tuning-requirements.txt -RUN pip install --no-cache-dir -r tuning-requirements.txt - -COPY kaito/presets/tuning/${MODEL_TYPE}/cli.py /workspace/tfs/cli.py -COPY kaito/presets/tuning/${MODEL_TYPE}/fine_tuning.py /workspace/tfs/fine_tuning.py -COPY kaito/presets/tuning/${MODEL_TYPE}/parser.py /workspace/tfs/parser.py -COPY kaito/presets/tuning/${MODEL_TYPE}/dataset.py /workspace/tfs/dataset.py - -# Copy the metrics server -COPY kaito/presets/tuning/${MODEL_TYPE}/metrics/metrics_server.py /workspace/tfs/metrics_server.py - -# Copy the entire model weights to the weights directory -COPY ${WEIGHTS_PATH} /workspace/tfs/weights +COPY kaito/presets/inference/vllm/requirements.txt /workspace/vllm/inference-requirements.txt + +RUN pip install --no-cache-dir -r /workspace/tfs/inference-requirements.txt && \ + pip install --no-cache-dir -r /workspace/tfs/tuning-requirements.txt && \ + pip install --no-cache-dir -r /workspace/vllm/inference-requirements.txt + +# 1. Huggingface transformers +COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py \ + kaito/presets/tuning/${MODEL_TYPE}/cli.py \ + kaito/presets/tuning/${MODEL_TYPE}/fine_tuning.py \ + kaito/presets/tuning/${MODEL_TYPE}/parser.py \ + kaito/presets/tuning/${MODEL_TYPE}/dataset.py \ + kaito/presets/tuning/${MODEL_TYPE}/metrics/metrics_server.py \ + /workspace/tfs/ + +# 2. vLLM +COPY kaito/presets/inference/vllm/inference_api.py /workspace/vllm/inference_api.py + +# Chat template +RUN apt update && apt install -y git && \ + rm /var/lib/apt/lists/* -r +RUN git clone https://github.com/chujiezheng/chat_templates /tmp/chat_templates && \ + cd /tmp/chat_templates && \ + git reset --hard 670a2eb && \ + cp -r ./chat_templates/ /workspace/ && \ + rm -rf /tmp/chat_templates + +# Model weights +COPY ${WEIGHTS_PATH} /workspace/weights +RUN echo $VERSION > /workspace/version.txt && \ + ln -s /workspace/weights /workspace/tfs/weights && \ + ln -s /workspace/weights /workspace/vllm/weights diff --git a/hack/run-pytest-in-venv.sh b/hack/run-pytest-in-venv.sh index 3e73b5cfd..e56996d3c 100755 --- a/hack/run-pytest-in-venv.sh +++ b/hack/run-pytest-in-venv.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -ex +set -e if [ "$#" -ne 2 ]; then echo "Usage: $0 " @@ -20,7 +20,7 @@ trap cleanup EXIT cd $VENV_DIR printf "Creating virtual environment in %s\n" "$VENV_DIR" -python3 -m virtualenv venv +python3 -m virtualenv --system-site-packages venv source "$VENV_DIR/venv/bin/activate" if [ "$?" 
-ne 0 ]; then printf "Failed to activate virtual environment\n" diff --git a/pkg/controllers/workspace_controller.go b/pkg/controllers/workspace_controller.go index 77a7d3f5c..c0d9028b6 100644 --- a/pkg/controllers/workspace_controller.go +++ b/pkg/controllers/workspace_controller.go @@ -830,7 +830,7 @@ func (c *WorkspaceReconciler) applyInference(ctx context.Context, wObj *kaitov1a } else if apierrors.IsNotFound(err) { var workloadObj client.Object // Need to create a new workload - workloadObj, err = inference.CreatePresetInference(ctx, wObj, revisionStr, inferenceParam, model.SupportDistributedInference(), c.Client) + workloadObj, err = inference.CreatePresetInference(ctx, wObj, revisionStr, model, c.Client) if err != nil { return } diff --git a/pkg/inference/preset-inferences.go b/pkg/inference/preset-inferences.go index 20e503d38..86c369efe 100644 --- a/pkg/inference/preset-inferences.go +++ b/pkg/inference/preset-inferences.go @@ -11,6 +11,7 @@ import ( "github.com/azure/kaito/pkg/utils" "github.com/azure/kaito/pkg/utils/consts" + "github.com/azure/kaito/api/v1alpha1" kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1" "github.com/azure/kaito/pkg/model" "github.com/azure/kaito/pkg/resources" @@ -22,9 +23,8 @@ import ( ) const ( - ProbePath = "/healthz" - Port5000 = int32(5000) - InferenceFile = "inference_api.py" + ProbePath = "/health" + Port5000 = int32(5000) ) var ( @@ -70,7 +70,12 @@ var ( } ) -func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient client.Client, wObj *kaitov1alpha1.Workspace, inferenceObj *model.PresetParam) error { +func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient client.Client, wObj *kaitov1alpha1.Workspace, inferenceParam *model.PresetParam) error { + backendName := v1alpha1.GetWorkspaceBackendName(wObj) + if backendName != model.BackendNameHuggingfaceTransformers { + return fmt.Errorf("distributed inference is not supported for backend %s", backendName) + } + existingService := &corev1.Service{} err := resources.GetResource(ctx, wObj.Name, wObj.Namespace, kubeClient, existingService) if err != nil { @@ -78,18 +83,18 @@ func updateTorchParamsForDistributedInference(ctx context.Context, kubeClient cl } nodes := *wObj.Resource.Count - inferenceObj.TorchRunParams["nnodes"] = strconv.Itoa(nodes) - inferenceObj.TorchRunParams["nproc_per_node"] = strconv.Itoa(inferenceObj.WorldSize / nodes) + inferenceParam.Transformers.TorchRunParams["nnodes"] = strconv.Itoa(nodes) + inferenceParam.Transformers.TorchRunParams["nproc_per_node"] = strconv.Itoa(inferenceParam.WorldSize / nodes) if nodes > 1 { - inferenceObj.TorchRunParams["node_rank"] = "$(echo $HOSTNAME | grep -o '[^-]*$')" - inferenceObj.TorchRunParams["master_addr"] = existingService.Spec.ClusterIP - inferenceObj.TorchRunParams["master_port"] = "29500" - } - if inferenceObj.TorchRunRdzvParams != nil { - inferenceObj.TorchRunRdzvParams["max_restarts"] = "3" - inferenceObj.TorchRunRdzvParams["rdzv_id"] = "job" - inferenceObj.TorchRunRdzvParams["rdzv_backend"] = "c10d" - inferenceObj.TorchRunRdzvParams["rdzv_endpoint"] = + inferenceParam.Transformers.TorchRunParams["node_rank"] = "$(echo $HOSTNAME | grep -o '[^-]*$')" + inferenceParam.Transformers.TorchRunParams["master_addr"] = existingService.Spec.ClusterIP + inferenceParam.Transformers.TorchRunParams["master_port"] = "29500" + } + if inferenceParam.Transformers.TorchRunRdzvParams != nil { + inferenceParam.Transformers.TorchRunRdzvParams["max_restarts"] = "3" + 
inferenceParam.Transformers.TorchRunRdzvParams["rdzv_id"] = "job" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_backend"] = "c10d" + inferenceParam.Transformers.TorchRunRdzvParams["rdzv_endpoint"] = fmt.Sprintf("%s-0.%s-headless.%s.svc.cluster.local:29500", wObj.Name, wObj.Name, wObj.Namespace) } return nil @@ -114,14 +119,17 @@ func GetInferenceImageInfo(ctx context.Context, workspaceObj *kaitov1alpha1.Work } func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, revisionNum string, - inferenceObj *model.PresetParam, supportDistributedInference bool, kubeClient client.Client) (client.Object, error) { - if inferenceObj.TorchRunParams != nil && supportDistributedInference { - if err := updateTorchParamsForDistributedInference(ctx, kubeClient, workspaceObj, inferenceObj); err != nil { + model model.Model, kubeClient client.Client) (client.Object, error) { + inferenceParam := model.GetInferenceParameters().DeepCopy() + + if model.SupportDistributedInference() { + if err := updateTorchParamsForDistributedInference(ctx, kubeClient, workspaceObj, inferenceParam); err != nil { // klog.ErrorS(err, "failed to update torch params", "workspace", workspaceObj) return nil, err } } + // additional volume var volumes []corev1.Volume var volumeMounts []corev1.VolumeMount shmVolume, shmVolumeMount := utils.ConfigSHMVolume(*workspaceObj.Resource.Count) @@ -131,24 +139,35 @@ func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Work if shmVolumeMount.Name != "" { volumeMounts = append(volumeMounts, shmVolumeMount) } - if len(workspaceObj.Inference.Adapters) > 0 { adapterVolume, adapterVolumeMount := utils.ConfigAdapterVolume() volumes = append(volumes, adapterVolume) volumeMounts = append(volumeMounts, adapterVolumeMount) } + // resource requirements skuNumGPUs, err := utils.GetSKUNumGPUs(ctx, kubeClient, workspaceObj.Status.WorkerNodes, - workspaceObj.Resource.InstanceType, inferenceObj.GPUCountRequirement) + workspaceObj.Resource.InstanceType, inferenceParam.GPUCountRequirement) if err != nil { return nil, fmt.Errorf("failed to get SKU num GPUs: %v", err) } + resourceReq := corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), + }, + Limits: corev1.ResourceList{ + corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), + }, + } - commands, resourceReq := prepareInferenceParameters(ctx, inferenceObj, skuNumGPUs) - image, imagePullSecrets := GetInferenceImageInfo(ctx, workspaceObj, inferenceObj) + // inference command + backendName := v1alpha1.GetWorkspaceBackendName(workspaceObj) + commands := inferenceParam.GetInferenceCommand(backendName) + + image, imagePullSecrets := GetInferenceImageInfo(ctx, workspaceObj, inferenceParam) var depObj client.Object - if supportDistributedInference { + if model.SupportDistributedInference() { depObj = resources.GenerateStatefulSetManifest(ctx, workspaceObj, image, imagePullSecrets, *workspaceObj.Resource.Count, commands, containerPorts, livenessProbe, readinessProbe, resourceReq, tolerations, volumes, volumeMounts) } else { @@ -161,25 +180,3 @@ func CreatePresetInference(ctx context.Context, workspaceObj *kaitov1alpha1.Work } return depObj, nil } - -// prepareInferenceParameters builds a PyTorch command: -// torchrun baseCommand -// and sets the GPU resources required for inference. -// Returns the command and resource configuration. 
-func prepareInferenceParameters(ctx context.Context, inferenceObj *model.PresetParam, skuNumGPUs string) ([]string, corev1.ResourceRequirements) { - torchCommand := utils.BuildCmdStr(inferenceObj.BaseCommand, inferenceObj.TorchRunParams) - torchCommand = utils.BuildCmdStr(torchCommand, inferenceObj.TorchRunRdzvParams) - modelCommand := utils.BuildCmdStr(InferenceFile, inferenceObj.ModelRunParams) - commands := utils.ShellCmd(torchCommand + " " + modelCommand) - - resourceRequirements := corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), - }, - Limits: corev1.ResourceList{ - corev1.ResourceName(resources.CapacityNvidiaGPU): resource.MustParse(skuNumGPUs), - }, - } - - return commands, resourceRequirements -} diff --git a/pkg/inference/preset-inferences_test.go b/pkg/inference/preset-inferences_test.go index 31b7f4e84..400f66fb8 100644 --- a/pkg/inference/preset-inferences_test.go +++ b/pkg/inference/preset-inferences_test.go @@ -15,7 +15,6 @@ import ( "github.com/azure/kaito/api/v1alpha1" "github.com/azure/kaito/pkg/utils/test" - "github.com/azure/kaito/pkg/model" "github.com/azure/kaito/pkg/utils/plugin" "github.com/stretchr/testify/mock" appsv1 "k8s.io/api/apps/v1" @@ -28,6 +27,7 @@ var ValidStrength string = "0.5" func TestCreatePresetInference(t *testing.T) { test.RegisterTestModel() testcases := map[string]struct { + workspace *v1alpha1.Workspace nodeCount int modelName string callMocks func(c *test.MockClient) @@ -37,7 +37,8 @@ func TestCreatePresetInference(t *testing.T) { expectedVolume string }{ - "test-model": { + "test-model/vllm": { + workspace: test.MockWorkspaceWithPreset, nodeCount: 1, modelName: "test-model", callMocks: func(c *test.MockClient) { @@ -46,32 +47,48 @@ func TestCreatePresetInference(t *testing.T) { workload: "Deployment", // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams // So expected cmd consists of shell command and inference file - expectedCmd: "/bin/sh -c inference_api.py", + expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py", hasAdapters: false, }, - "test-distributed-model": { + "test-model-with-adapters/vllm": { + workspace: test.MockWorkspaceWithPreset, nodeCount: 1, - modelName: "test-distributed-model", + modelName: "test-model", callMocks: func(c *test.MockClient) { - c.On("Get", mock.IsType(context.TODO()), mock.Anything, mock.IsType(&corev1.Service{}), mock.Anything).Return(nil) - c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) + c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) }, - workload: "StatefulSet", - expectedCmd: "/bin/sh -c inference_api.py", - hasAdapters: false, + workload: "Deployment", + expectedCmd: "/bin/sh -c python3 /workspace/vllm/inference_api.py", + hasAdapters: true, + expectedVolume: "adapter-volume", }, - "test-model-with-adapters": { + "test-model/transformers": { + workspace: test.MockWorkspaceWithPresetTransformers, nodeCount: 1, modelName: "test-model", callMocks: func(c *test.MockClient) { c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil) }, - workload: "Deployment", - expectedCmd: "/bin/sh -c inference_api.py", - hasAdapters: true, - expectedVolume: "adapter-volume", + workload: "Deployment", + // No BaseCommand, TorchRunParams, TorchRunRdzvParams, or ModelRunParams + // So expected cmd consists of shell command and 
inference file + expectedCmd: "/bin/sh -c accelerate launch /workspace/tfs/inference_api.py", + hasAdapters: false, + }, + + "test-distributed-model/transformers": { + workspace: test.MockWorkspaceDistributedModelTransformers, + nodeCount: 1, + modelName: "test-distributed-model", + callMocks: func(c *test.MockClient) { + c.On("Get", mock.IsType(context.TODO()), mock.Anything, mock.IsType(&corev1.Service{}), mock.Anything).Return(nil) + c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil) + }, + workload: "StatefulSet", + expectedCmd: "/bin/sh -c accelerate launch --nnodes=1 --nproc_per_node=0 --max_restarts=3 --rdzv_id=job --rdzv_backend=c10d --rdzv_endpoint=testWorkspace-0.testWorkspace-headless.kaito.svc.cluster.local:29500 /workspace/tfs/inference_api.p", + hasAdapters: false, }, } @@ -81,7 +98,7 @@ func TestCreatePresetInference(t *testing.T) { mockClient := test.NewClient() tc.callMocks(mockClient) - workspace := test.MockWorkspaceWithPreset + workspace := tc.workspace workspace.Resource.Count = &tc.nodeCount if tc.hasAdapters { @@ -96,15 +113,8 @@ func TestCreatePresetInference(t *testing.T) { } } - useHeadlessSvc := false - - var inferenceObj *model.PresetParam model := plugin.KaitoModelRegister.MustGet(tc.modelName) - inferenceObj = model.GetInferenceParameters() - if strings.Contains(tc.modelName, "distributed") { - useHeadlessSvc = true - } svc := &corev1.Service{ ObjectMeta: v1.ObjectMeta{ Name: workspace.Name, @@ -116,7 +126,7 @@ func TestCreatePresetInference(t *testing.T) { } mockClient.CreateOrUpdateObjectInMap(svc) - createdObject, _ := CreatePresetInference(context.TODO(), workspace, test.MockWorkspaceWithPresetHash, inferenceObj, useHeadlessSvc, mockClient) + createdObject, _ := CreatePresetInference(context.TODO(), workspace, test.MockWorkspaceWithPresetHash, model, mockClient) createdWorkload := "" switch createdObject.(type) { case *appsv1.Deployment: diff --git a/pkg/model/interface.go b/pkg/model/interface.go index 56c925698..6f9996d33 100644 --- a/pkg/model/interface.go +++ b/pkg/model/interface.go @@ -4,6 +4,8 @@ package model import ( "time" + + "github.com/azure/kaito/pkg/utils" ) type Model interface { @@ -13,23 +15,131 @@ type Model interface { SupportTuning() bool } +// BackendName is LLM runtime name. +type BackendName string + +const ( + BackendNameHuggingfaceTransformers BackendName = "transformers" + BackendNameVLLM BackendName = "vllm" + + InferenceFileHuggingface = "/workspace/tfs/inference_api.py" + InferenceFileVLLM = "/workspace/vllm/inference_api.py" +) + // PresetParam defines the preset inference parameters for a model. type PresetParam struct { - ModelFamilyName string // The name of the model family. - ImageAccessMode string // Defines where the Image is Public or Private. - DiskStorageRequirement string // Disk storage requirements for the model. - GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference. - TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference. - PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference. - TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning. - TorchRunParams map[string]string // Parameters for configuring the torchrun command. - TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic). 
- BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. - ModelRunParams map[string]string // Parameters for running the model training/inference. + Tag string // The model image tag + ModelFamilyName string // The name of the model family. + ImageAccessMode string // Defines where the Image is Public or Private. + + DiskStorageRequirement string // Disk storage requirements for the model. + GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference. + TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference. + PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference. + TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning. + WorldSize int // Defines the number of processes required for distributed inference. + + BackendParam + // ReadinessTimeout defines the maximum duration for creating the workload. // This timeout accommodates the size of the image, ensuring pull completion // even under slower network conditions or unforeseen delays. ReadinessTimeout time.Duration - WorldSize int // Defines the number of processes required for distributed inference. - Tag string // The model image tag +} + +// BackendParam defines the llm backend parameters. +type BackendParam struct { + Transformers HuggingfaceTransformersParam + VLLM VLLMParam +} + +type HuggingfaceTransformersParam struct { + BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. + TorchRunParams map[string]string // Parameters for configuring the torchrun command. + TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic). + ModelRunParams map[string]string // Parameters for running the model training/inference. +} + +type VLLMParam struct { + BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line. + DistributionParams map[string]string // Parameters for distributed inference. + ModelRunParams map[string]string // Parameters for running the model training/inference. 
+} + +func (p *PresetParam) DeepCopy() *PresetParam { + if p == nil { + return nil + } + out := new(PresetParam) + *out = *p + out.BackendParam = p.BackendParam.DeepCopy() + out.TuningPerGPUMemoryRequirement = make(map[string]int, len(p.TuningPerGPUMemoryRequirement)) + for k, v := range p.TuningPerGPUMemoryRequirement { + out.TuningPerGPUMemoryRequirement[k] = v + } + return out +} + +func (rp *BackendParam) DeepCopy() BackendParam { + if rp == nil { + return BackendParam{} + } + out := BackendParam{} + out.Transformers = rp.Transformers.DeepCopy() + out.VLLM = rp.VLLM.DeepCopy() + return out +} + +func (h *HuggingfaceTransformersParam) DeepCopy() HuggingfaceTransformersParam { + if h == nil { + return HuggingfaceTransformersParam{} + } + out := HuggingfaceTransformersParam{} + out.BaseCommand = h.BaseCommand + out.TorchRunParams = make(map[string]string, len(h.TorchRunParams)) + for k, v := range h.TorchRunParams { + out.TorchRunParams[k] = v + } + out.TorchRunRdzvParams = make(map[string]string, len(h.TorchRunRdzvParams)) + for k, v := range h.TorchRunRdzvParams { + out.TorchRunRdzvParams[k] = v + } + out.ModelRunParams = make(map[string]string, len(h.ModelRunParams)) + for k, v := range h.ModelRunParams { + out.ModelRunParams[k] = v + } + return out +} + +func (v *VLLMParam) DeepCopy() VLLMParam { + if v == nil { + return VLLMParam{} + } + out := VLLMParam{} + out.BaseCommand = v.BaseCommand + out.DistributionParams = make(map[string]string, len(v.DistributionParams)) + for k, v := range v.DistributionParams { + out.DistributionParams[k] = v + } + out.ModelRunParams = make(map[string]string, len(v.ModelRunParams)) + for k, v := range v.ModelRunParams { + out.ModelRunParams[k] = v + } + return out +} + +// builds the container command: +// eg. torchrun baseCommand +func (p *PresetParam) GetInferenceCommand(backend BackendName) []string { + switch backend { + case BackendNameHuggingfaceTransformers: + torchCommand := utils.BuildCmdStr(p.Transformers.BaseCommand, p.Transformers.TorchRunParams, p.Transformers.TorchRunRdzvParams) + modelCommand := utils.BuildCmdStr(InferenceFileHuggingface, p.Transformers.ModelRunParams) + return utils.ShellCmd(torchCommand + " " + modelCommand) + case BackendNameVLLM: + modelCommand := utils.BuildCmdStr(InferenceFileVLLM, p.VLLM.ModelRunParams) + return utils.ShellCmd(p.VLLM.BaseCommand + " " + modelCommand) + default: + return nil + } } diff --git a/pkg/tuning/preset-tuning.go b/pkg/tuning/preset-tuning.go index ad81115a5..5743d2718 100644 --- a/pkg/tuning/preset-tuning.go +++ b/pkg/tuning/preset-tuning.go @@ -490,7 +490,7 @@ func handleURLDataSource(ctx context.Context, workspaceObj *kaitov1alpha1.Worksp } func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam) (string, error) { - modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.ModelRunParams) + modelCommand := utils.BuildCmdStr(TuningFile, tuningObj.Transformers.ModelRunParams) return modelCommand, nil } @@ -500,14 +500,14 @@ func prepareModelRunParameters(ctx context.Context, tuningObj *model.PresetParam // Returns the command and resource configuration. 
func prepareTuningParameters(ctx context.Context, wObj *kaitov1alpha1.Workspace, modelCommand string, tuningObj *model.PresetParam, skuNumGPUs string) ([]string, corev1.ResourceRequirements) { - if tuningObj.TorchRunParams == nil { - tuningObj.TorchRunParams = make(map[string]string) + hfParam := tuningObj.Transformers // Only support Huggingface for now + if hfParam.TorchRunParams == nil { + hfParam.TorchRunParams = make(map[string]string) } // Set # of processes to GPU Count numProcesses := getInstanceGPUCount(wObj.Resource.InstanceType) - tuningObj.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) - torchCommand := utils.BuildCmdStr(tuningObj.BaseCommand, tuningObj.TorchRunParams) - torchCommand = utils.BuildCmdStr(torchCommand, tuningObj.TorchRunRdzvParams) + hfParam.TorchRunParams["num_processes"] = fmt.Sprintf("%d", numProcesses) + torchCommand := utils.BuildCmdStr(hfParam.BaseCommand, hfParam.TorchRunParams, hfParam.TorchRunRdzvParams) commands := utils.ShellCmd(torchCommand + " " + modelCommand) resourceRequirements := corev1.ResourceRequirements{ diff --git a/pkg/tuning/preset-tuning_test.go b/pkg/tuning/preset-tuning_test.go index 99344ddcb..405e86271 100644 --- a/pkg/tuning/preset-tuning_test.go +++ b/pkg/tuning/preset-tuning_test.go @@ -416,9 +416,13 @@ func TestPrepareTuningParameters(t *testing.T) { }, modelCommand: "model-command", tuningObj: &model.PresetParam{ - BaseCommand: "python train.py", - TorchRunParams: map[string]string{}, - TorchRunRdzvParams: map[string]string{}, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "python train.py", + TorchRunParams: map[string]string{}, + TorchRunRdzvParams: map[string]string{}, + }, + }, GPUCountRequirement: "2", }, expectedCommands: []string{"/bin/sh", "-c", "python train.py --num_processes=1 model-command"}, diff --git a/pkg/utils/common-preset.go b/pkg/utils/common-preset.go index df96276d4..c9f5f8dd0 100644 --- a/pkg/utils/common-preset.go +++ b/pkg/utils/common-preset.go @@ -3,7 +3,6 @@ package utils import ( - "github.com/azure/kaito/pkg/utils/plugin" corev1 "k8s.io/api/core/v1" ) @@ -150,7 +149,3 @@ func ConfigAdapterVolume() (corev1.Volume, corev1.VolumeMount) { } return volume, volumeMount } - -func IsValidPreset(preset string) bool { - return plugin.KaitoModelRegister.Has(preset) -} diff --git a/pkg/utils/common.go b/pkg/utils/common.go index 94209bf83..8b53f14f6 100644 --- a/pkg/utils/common.go +++ b/pkg/utils/common.go @@ -66,13 +66,15 @@ func MergeConfigMaps(baseMap, overrideMap map[string]string) map[string]string { return merged } -func BuildCmdStr(baseCommand string, runParams map[string]string) string { +func BuildCmdStr(baseCommand string, runParams ...map[string]string) string { updatedBaseCommand := baseCommand - for key, value := range runParams { - if value == "" { - updatedBaseCommand = fmt.Sprintf("%s --%s", updatedBaseCommand, key) - } else { - updatedBaseCommand = fmt.Sprintf("%s --%s=%s", updatedBaseCommand, key, value) + for _, runParam := range runParams { + for key, value := range runParam { + if value == "" { + updatedBaseCommand = fmt.Sprintf("%s --%s", updatedBaseCommand, key) + } else { + updatedBaseCommand = fmt.Sprintf("%s --%s=%s", updatedBaseCommand, key, value) + } } } diff --git a/pkg/utils/common_test.go b/pkg/utils/common_test.go index e23997692..b214150e7 100644 --- a/pkg/utils/common_test.go +++ b/pkg/utils/common_test.go @@ -2,7 +2,6 @@ package utils import ( "context" - "sigs.k8s.io/controller-runtime/pkg/client" 
"testing" "github.com/stretchr/testify/assert" @@ -12,6 +11,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) diff --git a/pkg/utils/plugin/plugin.go b/pkg/utils/plugin/plugin.go index 6f186a9f9..e8708e7f0 100644 --- a/pkg/utils/plugin/plugin.go +++ b/pkg/utils/plugin/plugin.go @@ -60,3 +60,7 @@ func (reg *ModelRegister) Has(name string) bool { _, ok := reg.models[name] return ok } + +func IsValidPreset(preset string) bool { + return KaitoModelRegister.Has(preset) +} diff --git a/pkg/utils/test/testModel.go b/pkg/utils/test/testModel.go index d12d3f720..83f52b66a 100644 --- a/pkg/utils/test/testModel.go +++ b/pkg/utils/test/testModel.go @@ -15,7 +15,15 @@ type testModel struct{} func (*testModel) GetInferenceParameters() *model.PresetParam { return &model.PresetParam{ GPUCountRequirement: "1", - ReadinessTimeout: time.Duration(30) * time.Minute, + BackendParam: model.BackendParam{ + VLLM: model.VLLMParam{ + BaseCommand: "python3", + }, + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "accelerate launch", + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, } } func (*testModel) GetTuningParameters() *model.PresetParam { @@ -36,7 +44,15 @@ type testDistributedModel struct{} func (*testDistributedModel) GetInferenceParameters() *model.PresetParam { return &model.PresetParam{ GPUCountRequirement: "1", - ReadinessTimeout: time.Duration(30) * time.Minute, + BackendParam: model.BackendParam{ + VLLM: model.VLLMParam{ + BaseCommand: "python3", + }, + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: "accelerate launch", + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, } } func (*testDistributedModel) GetTuningParameters() *model.PresetParam { diff --git a/pkg/utils/test/testUtils.go b/pkg/utils/test/testUtils.go index 3e6410efc..ba6c762c6 100644 --- a/pkg/utils/test/testUtils.go +++ b/pkg/utils/test/testUtils.go @@ -6,6 +6,7 @@ package test import ( "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/azure/kaito/api/v1alpha1" + "github.com/azure/kaito/pkg/model" "github.com/samber/lo" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -47,6 +48,31 @@ var ( }, }, } + MockWorkspaceDistributedModelTransformers = &v1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testWorkspace", + Namespace: "kaito", + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceBackend: string(model.BackendNameHuggingfaceTransformers), + }, + }, + Resource: v1alpha1.ResourceSpec{ + Count: &gpuNodeCount, + InstanceType: "Standard_NC12s_v3", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "apps": "test", + }, + }, + }, + Inference: &v1alpha1.InferenceSpec{ + Preset: &v1alpha1.PresetSpec{ + PresetMeta: v1alpha1.PresetMeta{ + Name: "test-distributed-model", + }, + }, + }, + } MockWorkspaceWithPreferredNodes = &v1alpha1.Workspace{ ObjectMeta: metav1.ObjectMeta{ Name: "testWorkspace", @@ -115,6 +141,31 @@ var ( }, }, } + MockWorkspaceWithPresetTransformers = &v1alpha1.Workspace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testWorkspace", + Namespace: "kaito", + Annotations: map[string]string{ + v1alpha1.AnnotationWorkspaceBackend: string(model.BackendNameHuggingfaceTransformers), + }, + }, + Resource: v1alpha1.ResourceSpec{ + Count: &gpuNodeCount, + InstanceType: "Standard_NC12s_v3", + LabelSelector: &metav1.LabelSelector{ + 
MatchLabels: map[string]string{ + "apps": "test", + }, + }, + }, + Inference: &v1alpha1.InferenceSpec{ + Preset: &v1alpha1.PresetSpec{ + PresetMeta: v1alpha1.PresetMeta{ + Name: "test-model", + }, + }, + }, + } ) var MockWorkspaceWithPresetHash = "89ae127050ec264a5ce84db48ef7226574cdf1299e6bd27fe90b927e34cc8adb" diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index e9aca92e4..eb875cdeb 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -2,9 +2,12 @@ # Licensed under the MIT license. import logging import os -import subprocess +import sys +import signal +import codecs +from pathlib import Path from dataclasses import asdict, dataclass, field -from typing import Annotated, Any, Dict, List, Optional +from typing import Annotated, Any, Dict, List, Optional, Union import GPUtil import psutil @@ -30,7 +33,7 @@ class ModelConfig: """ Transformers Model Configuration Parameters """ - pipeline: str = field(metadata={"help": "The model pipeline for the pre-trained model"}) + pipeline: Optional[str] = field(default="text-generation", metadata={"help": "The model pipeline for the pre-trained model"}) pretrained_model_name_or_path: Optional[str] = field(default="/workspace/tfs/weights", metadata={"help": "Path to the pretrained model or model identifier from huggingface.co/models"}) combination_type: Optional[str]=field(default="svd", metadata={"help": "The combination type of multi adapters"}) state_dict: Optional[Dict[str, Any]] = field(default=None, metadata={"help": "State dictionary for the model"}) @@ -47,6 +50,7 @@ class ModelConfig: load_in_8bit: bool = field(default=False, metadata={"help": "Load model in 8-bit mode"}) torch_dtype: Optional[str] = field(default=None, metadata={"help": "The torch dtype for the pre-trained model"}) device_map: str = field(default="auto", metadata={"help": "The device map for the pre-trained model"}) + chat_template: Optional[str] = field(default=None, metadata={"help": "The file path to the chat template, or the template in single-line form for the specified model"}) # Method to process additional arguments def process_additional_args(self, addt_args: List[str]): @@ -83,7 +87,22 @@ def __post_init__(self): # validate parameters supported_pipelines = {"conversational", "text-generation"} if self.pipeline not in supported_pipelines: raise ValueError(f"Unsupported pipeline: {self.pipeline}") - + +def load_chat_template(chat_template: Optional[str]) -> Optional[str]: + logger.info(chat_template) + if chat_template is None: + return None + + JINJA_CHARS = "{}\n" + if any(c in chat_template for c in JINJA_CHARS): + resolved_chat_template = codecs.decode(chat_template, "unicode_escape") + else: + resolved_chat_template = Path(chat_template).read_text() + + logger.info("Chat template loaded successfully") + logger.info("Chat template: %s", resolved_chat_template) + return resolved_chat_template + parser = HfArgumentParser(ModelConfig) args, additional_args = parser.parse_args_into_dataclasses( @@ -98,7 +117,10 @@ def __post_init__(self): # validate parameters combination_type = model_args.pop('combination_type') app = FastAPI() +resovled_chat_template = load_chat_template(model_args.pop('chat_template')) tokenizer = AutoTokenizer.from_pretrained(**model_args) +if resovled_chat_template is not None: + tokenizer.chat_template = resovled_chat_template base_model = AutoModelForCausalLM.from_pretrained(**model_args) if not 
os.path.exists(ADAPTERS_DIR): @@ -153,7 +175,7 @@ def __post_init__(self): # validate parameters pipeline_kwargs["torch_dtype"] = args.torch_dtype pipeline = transformers.pipeline( - model_pipeline, + task="text-generation", model=model, tokenizer=tokenizer, **pipeline_kwargs @@ -492,7 +514,11 @@ def get_metrics(): logger.error(f"Error fetching metrics: {e}") raise HTTPException(status_code=500, detail=str(e)) +def shutdown_handler(sig, frame): + sys.exit(0) + if __name__ == "__main__": + signal.signal(signal.SIGINT, shutdown_handler) local_rank = int(os.environ.get("LOCAL_RANK", 0)) # Default to 0 if not set port = 5000 + local_rank # Adjust port based on local rank logger.info(f"Starting server on port {port}") diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 2530fed6a..bc92259a5 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -1,14 +1,14 @@ # Dependencies for TFS # Core Dependencies -transformers==4.41.2 -torch==2.2.0 +transformers >= 4.45.0 +torch==2.4.0 accelerate==0.30.1 fastapi>=0.111.0,<0.112.0 # Allow patch updates -pydantic==2.7.4 +pydantic>=2.9 uvicorn[standard]>=0.29.0,<0.30.0 # Allow patch updates peft==0.11.1 -numpy==1.22.4 +numpy<3.0,>=1.25.0 sentencepiece==0.2.0 # Utility libraries diff --git a/presets/inference/text-generation/tests/test_inference_api.py b/presets/inference/text-generation/tests/test_inference_api.py index baedbb832..e315d372d 100644 --- a/presets/inference/text-generation/tests/test_inference_api.py +++ b/presets/inference/text-generation/tests/test_inference_api.py @@ -13,6 +13,10 @@ # Add the parent directory to sys.path sys.path.append(parent_dir) +CHAT_TEMPLATE = ("{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}" + "{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}" + "{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}") + @pytest.fixture(params=[ {"pipeline": "text-generation", "model_path": "stanford-crfm/alias-gpt2-small-x21", "device": "cpu"}, {"pipeline": "conversational", "model_path": "stanford-crfm/alias-gpt2-small-x21", "device": "cpu"}, @@ -25,7 +29,8 @@ def configured_app(request): '--pipeline', request.param['pipeline'], '--pretrained_model_name_or_path', request.param['model_path'], '--device_map', request.param['device'], - '--allow_remote_files', 'True' + '--allow_remote_files', 'True', + '--chat_template', CHAT_TEMPLATE ] sys.argv = test_args diff --git a/presets/inference/vllm/inference_api.py b/presets/inference/vllm/inference_api.py index ab2613e9e..5b9a2d881 100644 --- a/presets/inference/vllm/inference_api.py +++ b/presets/inference/vllm/inference_api.py @@ -25,7 +25,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: # See https://docs.vllm.ai/en/latest/models/engine_args.html for more args engine_default_args = { - "model": "/workspace/tfs/weights", + "model": "/workspace/vllm/weights", "cpu-offload-gb": 0, "gpu-memory-utilization": 0.9, "swap-space": 4, diff --git a/presets/inference/vllm/requirements.txt b/presets/inference/vllm/requirements.txt index 4481a9966..e54487e83 100644 --- a/presets/inference/vllm/requirements.txt +++ b/presets/inference/vllm/requirements.txt @@ -3,9 +3,10 @@ # Core Dependencies vllm==0.6.3 torch==2.4.0 +transformers >= 4.45.0 uvloop numpy # For UTs pytest -requests \ No newline at end of file 
+requests diff --git a/presets/models/falcon/model.go b/presets/models/falcon/model.go index 74c39995f..dc9b2fb00 100644 --- a/presets/models/falcon/model.go +++ b/presets/models/falcon/model.go @@ -3,9 +3,10 @@ package falcon import ( - "github.com/azure/kaito/pkg/tuning" "time" + "github.com/azure/kaito/pkg/tuning" + kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1" "github.com/azure/kaito/pkg/inference" "github.com/azure/kaito/pkg/model" @@ -47,8 +48,13 @@ var ( baseCommandPresetFalconInference = "accelerate launch" baseCommandPresetFalconTuning = "python3 metrics_server.py & accelerate launch" falconRunParams = map[string]string{ - "torch_dtype": "bfloat16", - "pipeline": "text-generation", + "torch_dtype": "bfloat16", + "pipeline": "text-generation", + "chat_template": "/workspace/chat_templates/falcon-instruct.jinja", + } + falconRunParamsVLLM = map[string]string{ + "dtype": "bfloat16", + "chat-template": "/workspace/chat_templates/falcon-instruct.jinja", } ) @@ -64,11 +70,19 @@ func (*falcon7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon7B"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon7B"], } } func (*falcon7b) GetTuningParameters() *model.PresetParam { @@ -79,10 +93,14 @@ func (*falcon7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunPrams: falconRunTuningParams, // TODO + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconTuning, + TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunPrams: falconRunTuningParams, // TODO + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconTuning, Tag: PresetFalconTagMap["Falcon7B"], TuningPerGPUMemoryRequirement: map[string]int{"qlora": 16}, } @@ -107,11 +125,19 @@ func (*falcon7bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon7BInstruct"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon7BInstruct"], } } @@ -137,13 +163,20 @@ func (*falcon40b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon40B"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon40B"], } - } func (*falcon40b) GetTuningParameters() *model.PresetParam { return &model.PresetParam{ @@ -153,10 +186,14 @@ func (*falcon40b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "16Gi", - TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunPrams: falconRunTuningParams, // TODO + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconTuning, + TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunPrams: falconRunTuningParams, // TODO + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconTuning, Tag: PresetFalconTagMap["Falcon40B"], } } @@ -179,11 +216,19 @@ func (*falcon40bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "90Gi", PerGPUMemoryRequirement: "0Gi", // We run Falcon using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: falconRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetFalconInference, - Tag: PresetFalconTagMap["Falcon40BInstruct"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetFalconInference, + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: falconRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: falconRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetFalconTagMap["Falcon40BInstruct"], } } func (*falcon40bInst) GetTuningParameters() *model.PresetParam { diff --git a/presets/models/llama2/model.go b/presets/models/llama2/model.go index 6a62a8987..c8581798c 100644 --- a/presets/models/llama2/model.go +++ b/presets/models/llama2/model.go @@ -32,6 +32,9 @@ var ( "max_seq_len": "512", "max_batch_size": "8", } + llamaRunParamsVLLM = map[string]string{ + "max-seq-len-to-capture": "512", + } ) var llama2A llama2Text7b @@ -46,15 +49,22 @@ func (*llama2Text7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "14Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(10) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 1, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(10) * time.Minute, + WorldSize: 1, // Tag: llama has private image access mode. The image tag is determined by the user. } - } func (*llama2Text7b) GetTuningParameters() *model.PresetParam { return nil // Currently doesn't support fine-tuning @@ -78,12 +88,20 @@ func (*llama2Text13b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "30Gi", PerGPUMemoryRequirement: "15Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(20) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 2, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(20) * time.Minute, + WorldSize: 2, // Tag: llama has private image access mode. The image tag is determined by the user. 
} } @@ -109,12 +127,20 @@ func (*llama2Text70b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "8", TotalGPUMemoryRequirement: "152Gi", PerGPUMemoryRequirement: "19Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 8, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + WorldSize: 8, // Tag: llama has private image access mode. The image tag is determined by the user. } } diff --git a/presets/models/llama2chat/model.go b/presets/models/llama2chat/model.go index 89225bef5..72aa04f40 100644 --- a/presets/models/llama2chat/model.go +++ b/presets/models/llama2chat/model.go @@ -32,6 +32,9 @@ var ( "max_seq_len": "512", "max_batch_size": "8", } + llamaRunParamsVLLM = map[string]string{ + "max-seq-len-to-capture": "512", + } ) var llama2chatA llama2Chat7b @@ -46,12 +49,20 @@ func (*llama2Chat7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "14Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(10) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 1, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(10) * time.Minute, + WorldSize: 1, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -77,12 +88,21 @@ func (*llama2Chat13b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "2", TotalGPUMemoryRequirement: "30Gi", PerGPUMemoryRequirement: "15Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. 
- TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(20) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 2, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(20) * time.Minute, + + WorldSize: 2, // Tag: llama has private image access mode. The image tag is determined by the user. } } @@ -108,12 +128,20 @@ func (*llama2Chat70b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "8", TotalGPUMemoryRequirement: "192Gi", PerGPUMemoryRequirement: "19Gi", // We run llama2 using tensor parallelism, the memory of each GPU needs to be bigger than the tensor shard size. - TorchRunParams: inference.DefaultTorchRunParams, - TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, - ModelRunParams: llamaRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetLlama, - WorldSize: 8, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + BaseCommand: baseCommandPresetLlama, + TorchRunParams: inference.DefaultTorchRunParams, + TorchRunRdzvParams: inference.DefaultTorchRunRdzvParams, + ModelRunParams: llamaRunParams, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: llamaRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + WorldSize: 8, // Tag: llama has private image access mode. The image tag is determined by the user. } } diff --git a/presets/models/mistral/model.go b/presets/models/mistral/model.go index b4581d6f1..a7b62e022 100644 --- a/presets/models/mistral/model.go +++ b/presets/models/mistral/model.go @@ -36,6 +36,11 @@ var ( mistralRunParams = map[string]string{ "torch_dtype": "bfloat16", "pipeline": "text-generation", + "chat_template": "/workspace/chat_templates/mistral-instruct.jinja", + } + mistralRunParamsVLLM = map[string]string{ + "dtype": "bfloat16", + "chat-template": "/workspace/chat_templates/mistral-instruct.jinja", } ) @@ -51,11 +56,19 @@ func (*mistral7b) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "14Gi", PerGPUMemoryRequirement: "0Gi", // We run Mistral using native vertical model parallel, no per GPU memory requirement. 
- TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: mistralRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralInference, - Tag: PresetMistralTagMap["Mistral7B"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralInference, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: mistralRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetMistralTagMap["Mistral7B"], } } @@ -67,10 +80,14 @@ func (*mistral7b) GetTuningParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "16Gi", - //TorchRunParams: tuning.DefaultAccelerateParams, - //ModelRunParams: mistralRunParams, + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + //TorchRunParams: tuning.DefaultAccelerateParams, + //ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralTuning, + }, + }, ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralTuning, Tag: PresetMistralTagMap["Mistral7B"], } } @@ -94,11 +111,19 @@ func (*mistral7bInst) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "16Gi", PerGPUMemoryRequirement: "0Gi", // We run mistral using native vertical model parallel, no per GPU memory requirement. - TorchRunParams: inference.DefaultAccelerateParams, - ModelRunParams: mistralRunParams, - ReadinessTimeout: time.Duration(30) * time.Minute, - BaseCommand: baseCommandPresetMistralInference, - Tag: PresetMistralTagMap["Mistral7BInstruct"], + BackendParam: model.BackendParam{ + Transformers: model.HuggingfaceTransformersParam{ + TorchRunParams: inference.DefaultAccelerateParams, + ModelRunParams: mistralRunParams, + BaseCommand: baseCommandPresetMistralInference, + }, + VLLM: model.VLLMParam{ + BaseCommand: "python3", + ModelRunParams: mistralRunParamsVLLM, + }, + }, + ReadinessTimeout: time.Duration(30) * time.Minute, + Tag: PresetMistralTagMap["Mistral7BInstruct"], } } diff --git a/presets/models/phi2/model.go b/presets/models/phi2/model.go index 07fb8e0d2..77cc5c46b 100644 --- a/presets/models/phi2/model.go +++ b/presets/models/phi2/model.go @@ -31,6 +31,9 @@ var ( "torch_dtype": "float16", "pipeline": "text-generation", } + phiRunParamsVLLM = map[string]string{ + "dtype": "float16", + } ) var phiA phi2 @@ -45,11 +48,19 @@ func (*phi2) GetInferenceParameters() *model.PresetParam { GPUCountRequirement: "1", TotalGPUMemoryRequirement: "12Gi", PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement. 
-		TorchRunParams: inference.DefaultAccelerateParams,
-		ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiInference,
-		Tag: PresetPhiTagMap["Phi2"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				TorchRunParams: inference.DefaultAccelerateParams,
+				ModelRunParams: phiRunParams,
+				BaseCommand: baseCommandPresetPhiInference,
+			},
+			VLLM: model.VLLMParam{
+				BaseCommand: "python3",
+				ModelRunParams: phiRunParamsVLLM,
+			},
+		},
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		Tag: PresetPhiTagMap["Phi2"],
 	}
 }
 func (*phi2) GetTuningParameters() *model.PresetParam {
@@ -60,10 +71,14 @@ func (*phi2) GetTuningParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "16Gi",
 		PerGPUMemoryRequirement: "16Gi",
-		// TorchRunParams: inference.DefaultAccelerateParams,
-		// ModelRunParams: phiRunParams,
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				// TorchRunParams: inference.DefaultAccelerateParams,
+				// ModelRunParams: phiRunParams,
+				BaseCommand: baseCommandPresetPhiTuning,
+			},
+		},
 		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiTuning,
 		Tag: PresetPhiTagMap["Phi2"],
 	}
 }
diff --git a/presets/models/phi3/model.go b/presets/models/phi3/model.go
index 5656fc15a..4f2218495 100644
--- a/presets/models/phi3/model.go
+++ b/presets/models/phi3/model.go
@@ -50,6 +50,9 @@ var (
 		"pipeline": "text-generation",
 		"trust_remote_code": "",
 	}
+	phiRunParamsVLLM = map[string]string{
+		"dtype": "auto",
+	}
 )
 var phi3MiniA phi3Mini4KInst
@@ -64,11 +67,19 @@ func (*phi3Mini4KInst) GetInferenceParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "9Gi",
 		PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
-		TorchRunParams: inference.DefaultAccelerateParams,
-		ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiInference,
-		Tag: PresetPhiTagMap["Phi3Mini4kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiInference,
+				TorchRunParams: inference.DefaultAccelerateParams,
+				ModelRunParams: phiRunParams,
+			},
+			VLLM: model.VLLMParam{
+				BaseCommand: "python3",
+				ModelRunParams: phiRunParamsVLLM,
+			},
+		},
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		Tag: PresetPhiTagMap["Phi3Mini4kInstruct"],
 	}
 }
 func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam {
@@ -82,8 +93,12 @@ func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam {
 		// TorchRunParams: inference.DefaultAccelerateParams,
 		// ModelRunParams: phiRunParams,
 		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiTuning,
-		Tag: PresetPhiTagMap["Phi3Mini4kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiTuning,
+			},
+		},
+		Tag: PresetPhiTagMap["Phi3Mini4kInstruct"],
 	}
 }
 func (*phi3Mini4KInst) SupportDistributedInference() bool { return false }
@@ -103,11 +118,19 @@ func (*phi3Mini128KInst) GetInferenceParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "9Gi",
 		PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
-		TorchRunParams: inference.DefaultAccelerateParams,
-		ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiInference,
-		Tag: PresetPhiTagMap["Phi3Mini128kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiInference,
+				TorchRunParams: inference.DefaultAccelerateParams,
+				ModelRunParams: phiRunParams,
+			},
+			VLLM: model.VLLMParam{
+				BaseCommand: "python3",
+				ModelRunParams: phiRunParamsVLLM,
+			},
+		},
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		Tag: PresetPhiTagMap["Phi3Mini128kInstruct"],
 	}
 }
 func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam {
@@ -118,11 +141,17 @@ func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "72Gi",
 		PerGPUMemoryRequirement: "72Gi",
-		// TorchRunParams: inference.DefaultAccelerateParams,
-		// ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiTuning,
-		Tag: PresetPhiTagMap["Phi3Mini128kInstruct"],
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiTuning,
+			},
+			VLLM: model.VLLMParam{
+				BaseCommand: "python3",
+				ModelRunParams: phiRunParamsVLLM,
+			},
+		},
+		Tag: PresetPhiTagMap["Phi3Mini128kInstruct"],
 	}
 }
 func (*phi3Mini128KInst) SupportDistributedInference() bool { return false }
@@ -142,11 +171,19 @@ func (*Phi3Medium4kInstruct) GetInferenceParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "28Gi",
 		PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
-		TorchRunParams: inference.DefaultAccelerateParams,
-		ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiInference,
-		Tag: PresetPhiTagMap["Phi3Medium4kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiInference,
+				TorchRunParams: inference.DefaultAccelerateParams,
+				ModelRunParams: phiRunParams,
+			},
+			VLLM: model.VLLMParam{
+				BaseCommand: "python3",
+				ModelRunParams: phiRunParamsVLLM,
+			},
+		},
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		Tag: PresetPhiTagMap["Phi3Medium4kInstruct"],
 	}
 }
 func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam {
@@ -160,8 +197,12 @@ func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam {
 		// TorchRunParams: inference.DefaultAccelerateParams,
 		// ModelRunParams: phiRunParams,
 		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiTuning,
-		Tag: PresetPhiTagMap["Phi3Medium4kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiTuning,
+			},
+		},
+		Tag: PresetPhiTagMap["Phi3Medium4kInstruct"],
 	}
 }
 func (*Phi3Medium4kInstruct) SupportDistributedInference() bool { return false }
@@ -181,11 +222,15 @@ func (*Phi3Medium128kInstruct) GetInferenceParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "28Gi",
 		PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
-		TorchRunParams: inference.DefaultAccelerateParams,
-		ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiInference,
-		Tag: PresetPhiTagMap["Phi3Medium128kInstruct"],
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiInference,
+				TorchRunParams: inference.DefaultAccelerateParams,
+				ModelRunParams: phiRunParams,
+			},
+		},
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		Tag: PresetPhiTagMap["Phi3Medium128kInstruct"],
 	}
 }
 func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam {
@@ -196,11 +241,13 @@ func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam {
 		GPUCountRequirement: "1",
 		TotalGPUMemoryRequirement: "80Gi",
 		PerGPUMemoryRequirement: "80Gi",
-		// TorchRunParams: inference.DefaultAccelerateParams,
-		// ModelRunParams: phiRunParams,
-		ReadinessTimeout: time.Duration(30) * time.Minute,
-		BaseCommand: baseCommandPresetPhiTuning,
-		Tag: PresetPhiTagMap["Phi3Medium128kInstruct"],
+		ReadinessTimeout: time.Duration(30) * time.Minute,
+		BackendParam: model.BackendParam{
+			Transformers: model.HuggingfaceTransformersParam{
+				BaseCommand: baseCommandPresetPhiTuning,
+			},
+		},
+		Tag: PresetPhiTagMap["Phi3Medium128kInstruct"],
 	}
 }
 func (*Phi3Medium128kInstruct) SupportDistributedInference() bool { return false }
diff --git a/presets/models/supported_models.yaml b/presets/models/supported_models.yaml
index be25c83f5..d82eb5582 100644
--- a/presets/models/supported_models.yaml
+++ b/presets/models/supported_models.yaml
@@ -104,8 +104,9 @@ models:
     type: text-generation
    version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/d269012bea6fbe38ce7752c8940fea010eea3383
    runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Add vllm inference backend
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
diff --git a/presets/tuning/text-generation/requirements.txt b/presets/tuning/text-generation/requirements.txt
index e2aeb3098..3da067c95 100644
--- a/presets/tuning/text-generation/requirements.txt
+++ b/presets/tuning/text-generation/requirements.txt
@@ -1,6 +1,6 @@
 # Core Dependencies
-transformers==4.41.2
-torch==2.2.0
+transformers >= 4.45.0
+torch==2.4.0
 accelerate==0.30.1
 fastapi>=0.111.0,<0.112.0 # Allow patch updates
 pydantic==2.7.4
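
The preset changes above only show the call sites of the new grouping; the `pkg/model` types they reference (`BackendParam`, `HuggingfaceTransformersParam`, `VLLMParam`) are defined elsewhere and are not part of these hunks. The sketch below is a minimal reconstruction of what those definitions presumably look like, inferred purely from the fields the presets populate; the field names and types here are assumptions, not the package's actual code.

```go
// Minimal sketch, assuming definitions that match the composite literals used
// in the presets above. The real pkg/model package may differ.
package model

import "time"

// HuggingfaceTransformersParam carries the settings consumed by the
// transformers-based inference/tuning entrypoints.
type HuggingfaceTransformersParam struct {
	BaseCommand        string            // launcher or tuning command for the transformers path
	TorchRunParams     map[string]string // torchrun/accelerate flags
	TorchRunRdzvParams map[string]string // rendezvous flags for multi-node (llama) runs
	ModelRunParams     map[string]string // flags passed to the model script
}

// VLLMParam carries the settings consumed by the vLLM inference entrypoint.
type VLLMParam struct {
	BaseCommand    string            // "python3" in the presets above
	ModelRunParams map[string]string // e.g. {"dtype": "float16"} or a chat-template path
}

// BackendParam groups the per-backend halves of a preset.
type BackendParam struct {
	Transformers HuggingfaceTransformersParam
	VLLM         VLLMParam
}

// PresetParam sketches only the fields visible in the hunks above; the real
// struct carries additional fields not shown in this diff.
type PresetParam struct {
	GPUCountRequirement       string
	TotalGPUMemoryRequirement string
	PerGPUMemoryRequirement   string
	BackendParam              // embedded, so literals can set BackendParam: model.BackendParam{...}
	ReadinessTimeout          time.Duration
	WorldSize                 int
	Tag                       string
}
```

Structured this way, the backend-neutral requirements (GPU count, memory, readiness timeout, tag) stay flat on the preset, while the controller can select either the Transformers or the vLLM half of `BackendParam` when assembling the pod command for the chosen backend.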