From 4e268e54908dbc2579d3dfab162cf644b2fc3d59 Mon Sep 17 00:00:00 2001
From: Dariusz Trawinski
Date: Mon, 23 Dec 2024 17:28:51 +0100
Subject: [PATCH 1/4] fix handling device name with brackets

---
 src/ov_utils.cpp           |  5 +++++
 src/test/ov_utils_test.cpp | 11 +++++++++++
 2 files changed, 16 insertions(+)

diff --git a/src/ov_utils.cpp b/src/ov_utils.cpp
index 78a70b4bca..d71be1dbe1 100644
--- a/src/ov_utils.cpp
+++ b/src/ov_utils.cpp
@@ -122,6 +122,11 @@ Status validatePluginConfiguration(const plugin_config_t& pluginConfig, const st
         std::string deviceName;
         while (getline(ss, deviceName, deviceDelimiter)) {
+            char bracket = '(';
+            auto bracketPos = deviceName.find(bracket);
+            if (bracketPos != std::string::npos) {
+                deviceName = deviceName.substr(0, bracketPos);
+            }
             insertSupportedKeys(pluginSupportedConfigKeys, deviceName, ieCore);
         }
     } else {
diff --git a/src/test/ov_utils_test.cpp b/src/test/ov_utils_test.cpp
index 144ea1bc04..3ebe63ee80 100644
--- a/src/test/ov_utils_test.cpp
+++ b/src/test/ov_utils_test.cpp
@@ -203,6 +203,17 @@ TEST(OVUtils, ValidatePluginConfigurationPositive) {
     EXPECT_TRUE(status.ok());
 }
 
+TEST(OVUtils, ValidatePluginConfigurationPositiveBatch) {
+    ov::Core ieCore;
+    std::shared_ptr<ov::Model> model = ieCore.read_model(std::filesystem::current_path().u8string() + "/src/test/dummy/1/dummy.xml");
+    ovms::ModelConfig config;
+    config.setTargetDevice("BATCH:CPU(4)");
+    config.setPluginConfig({{"AUTO_BATCH_TIMEOUT", 10}});
+    ovms::plugin_config_t supportedPluginConfig = ovms::ModelInstance::prepareDefaultPluginConfig(config);
+    auto status = ovms::validatePluginConfiguration(supportedPluginConfig, "BATCH:CPU(4)", ieCore);
+    EXPECT_TRUE(status.ok());
+}
+
 TEST(OVUtils, ValidatePluginConfigurationNegative) {
     ov::Core ieCore;
     std::shared_ptr<ov::Model> model = ieCore.read_model(std::filesystem::current_path().u8string() + "/src/test/dummy/1/dummy.xml");

From f78950b04ec4ce7117976315e662c28e613c89fc Mon Sep 17 00:00:00 2001
From: Trawinski
Date: Mon, 23 Dec 2024 17:31:19 +0100
Subject: [PATCH 2/4] update documentation about accelerators

---
 docs/accelerators.md | 105 +++++++++++++------------------------
 1 file changed, 31 insertions(+), 74 deletions(-)

diff --git a/docs/accelerators.md b/docs/accelerators.md
index c1493b8be2..cd3c3d378d 100644
--- a/docs/accelerators.md
+++ b/docs/accelerators.md
@@ -47,53 +47,30 @@ docker run --rm -it --device=/dev/dxg --volume /usr/lib/wsl:/usr/lib/wsl -u $(i
 --model_path /opt/model --model_name resnet --port 9001 --target_device GPU
 ```
 
-> **NOTE**:
-> The public docker image includes the OpenCL drivers for GPU in version 22.28 (RedHat) and 22.35 (Ubuntu).
-If you need to build the OpenVINO Model Server with different driver version, refer to the [building from sources](https://github.com/openvinotoolkit/model_server/blob/main/docs/build_from_source.md)
 
-## Using Multi-Device Plugin
-
-If you have multiple inference devices available (e.g. GPU, CPU, and NPU) you can increase inference throughput by enabling the Multi-Device Plugin.
-It distributes Inference requests among multiple devices, balancing out the load. For more detailed information read OpenVINO’s [Multi-Device plugin documentation](https://docs.openvino.ai/2024/documentation/legacy-features/multi-device.html) documentation.
-
-To use this feature in OpenVINO Model Server, you can choose one of two ways:
+## Using NPU device Plugin
 
-1. Use a .json configuration file to set the `--target_device` parameter with the pattern of: `MULTI:<DEVICE_1>,<DEVICE_2>`.
-The order of the devices will define their priority, in this case making `device_1` the primary selection.
+OpenVINO Model Server can support using [NPU device](https://docs.openvino.ai/canonical/openvino_docs_install_guides_configurations_for_intel_npu.html)
 
-This example of a config.json file sets up the Multi-Device Plugin for a resnet model, using GPU and CPU as devices:
+Docker image with required dependencies can be build using this procedure:
+The docker image of OpenVINO Model Server including support for NVIDIA can be built from sources
 
 ```bash
-echo '{"model_config_list": [
-   {"config": {
-      "name": "resnet",
-      "base_path": "/opt/model",
-      "batch_size": "1",
-      "target_device": "MULTI:GPU,CPU"}
-   }]
-}' >> models/public/resnet-50-tf/config.json
+git clone https://github.com/openvinotoolkit/model_server.git
+cd model_server
+make release_image NPU=1
+cd ..
 ```
 
-To start OpenVINO Model Server, with the described config file placed as `./models/config.json`, set the `grpc_workers` parameter to match the `nireq` field in config.json
-and use the run command, like so:
-
+Example command to run container with NPU:
 ```bash
-docker run -d --rm --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \
--u $(id -u):$(id -g) -v ${PWD}/models/public/resnet-50-tf/:/opt/model:ro -p 9001:9001 \
-openvino/model_server:latest-gpu --config_path /opt/model/config.json --port 9001
+docker run --device /dev/accel -p 9000:9000 --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
+-v ${PWD}/models/public/resnet-50-tf:/opt/model openvino/model_server:latest --model_path /opt/model --model_name resnet --port 9000 --target_device NPU
 ```
+Check more info about the [NPU driver for Linux](https://github.com/intel/linux-npu-driver).
 
-2. When using just a single model, you can start OpenVINO Model Server without the config.json file. To do so, use the run command together with additional parameters, like so:
-
-```bash
-docker run -d --rm --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \
--u $(id -u):$(id -g) -v ${PWD}/models/public/resnet-50-tf/:/opt/model:ro -p 9001:9001 \
-openvino/model_server:latest-gpu --model_path /opt/model --model_name resnet --port 9001 --target_device 'MULTI:GPU,CPU'
-```
-
-The deployed model will perform inference on both GPU and CPU.
-The total throughput will be roughly equal to the sum of GPU and CPU throughput.
 
 ## Using Heterogeneous Plugin
@@ -135,6 +112,8 @@ docker run --rm -d --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render*
 ```
 
 The `Auto Device` plugin can also use the [PERFORMANCE_HINT](performance_tuning.md) plugin config property that enables you to specify a performance mode for the plugin.
+While LATENCY and THROUGHPUT hints can select one target device with your preferred performance option, the CUMULATIVE_THROUGHPUT option enables running inference on multiple devices for higher throughput.
+With CUMULATIVE_THROUGHPUT, AUTO loads the network model to all available devices (specified by AUTO) in the candidate list, and then runs inference on them based on the default or specified priority.
 
 > **NOTE**: NUM_STREAMS and PERFORMANCE_HINT should not be used together.
 
@@ -160,52 +139,30 @@ docker run --rm -d --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render*
 --target_device AUTO
 ```
 
-> **NOTE**: currently, AUTO plugin cannot be used with `--shape auto` parameter while GPU device is enabled.
-
-## Using NVIDIA Plugin
-
-OpenVINO Model Server can be used also with NVIDIA GPU cards by using NVIDIA plugin from the [github repo openvino_contrib](https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/nvidia_plugin).
-The docker image of OpenVINO Model Server including support for NVIDIA can be built from sources
-
-```bash
-git clone https://github.com/openvinotoolkit/model_server.git
-cd model_server
-make docker_build NVIDIA=1 OV_USE_BINARY=0
-cd ..
-```
-Check also [building from sources](https://github.com/openvinotoolkit/model_server/blob/main/docs/build_from_source.md).
-
-Example command to run container with NVIDIA support:
-
-```bash
-docker run -it --gpus all -p 9000:9000 -v ${PWD}/models/public/resnet-50-tf:/opt/model openvino/model_server:latest-cuda --model_path /opt/model --model_name resnet --port 9000 --target_device NVIDIA
-```
+CUMULATIVE_THROUGHPUT
 
-For models with layers not supported on NVIDIA plugin, you can use a virtual plugin `HETERO` which can use multiple devices listed after the colon:
 ```bash
-docker run -it --gpus all -p 9000:9000 -v ${PWD}/models/public/resnet-50-tf:/opt/model openvino/model_server:latest-cuda --model_path /opt/model --model_name resnet --port 9000 --target_device HETERO:NVIDIA,CPU
+docker run --rm -d --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
+-v ${PWD}/models/public/resnet-50-tf:/opt/model -p 9001:9001 openvino/model_server:latest-gpu \
+--model_path /opt/model --model_name resnet --port 9001 \
+--plugin_config '{"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"}' \
+--target_device AUTO:GPU,CPU
 ```
-Check the supported [configuration parameters](https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/nvidia_plugin#supported-configuration-parameters) and [supported layers](https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/nvidia_plugin#supported-layers-and-limitations)
-
-## Using NPU device Plugin
+## Using Automatic Batching Plugin
 
-OpenVINO Model Server can support using [NPU device](https://docs.openvino.ai/canonical/openvino_docs_install_guides_configurations_for_intel_npu.html)
+[Auto Batching](https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/automatic-batching.html) (or BATCH in short) is a new special “virtual” device
+which explicitly defines the auto batching.
 
-Docker image with required dependencies can be build using this procedure:
-The docker image of OpenVINO Model Server including support for NVIDIA can be built from sources
+It performs automatic batching on-the-fly to improve device utilization by grouping inference requests together, without programming effort from the user.
+With Automatic Batching, gathering the input and scattering the output from the individual inference requests required for the batch happen transparently, without affecting the application code.
 
-```bash
-git clone https://github.com/openvinotoolkit/model_server.git
-cd model_server
-make release_image NPU=1
-cd ..
-```
+> **NOTE**: Autobatching can be applied only for static models
 
-Example command to run container with NPU:
 ```bash
-docker run --device /dev/accel -p 9000:9000 --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
--v ${PWD}/models/public/resnet-50-tf:/opt/model openvino/model_server:latest --model_path /opt/model --model_name resnet --port 9000 --target_device NPU
-```
-Check more info about the [NPU driver for Linux](https://github.com/intel/linux-npu-driver).
\ No newline at end of file
+docker run -v ${PWD}/models/public/resnet-50-tf:/opt/model -p 9001:9001 openvino/model_server:latest-gpu \
+--model_path /opt/model --model_name resnet --port 9001 \
+--plugin_config '{"AUTO_BATCH_TIMEOUT": 200}' \
+--target_device BATCH:CPU(16)
+```
\ No newline at end of file

From df0f668d602e3d1dc9120c011ba505da907e3084 Mon Sep 17 00:00:00 2001
From: Dariusz Trawinski
Date: Sun, 12 Jan 2025 01:06:15 +0100
Subject: [PATCH 3/4] updates

---
 docs/accelerators.md | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/docs/accelerators.md b/docs/accelerators.md
index cd3c3d378d..f4edfadf61 100644
--- a/docs/accelerators.md
+++ b/docs/accelerators.md
@@ -13,9 +13,8 @@ mv ${PWD}/models/public/resnet-50-tf/FP32 ${PWD}/models/public/resnet-50-tf/1
 
 ## Starting a Docker Container with Intel integrated GPU, Intel® Data Center GPU Flex Series and Intel® Arc™ GPU
 
-The [GPU plugin](https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) uses the Intel Compute Library for
-Deep Neural Networks ([clDNN](https://01.org/cldnn)) to infer deep neural networks. For inference execution, it employs Intel® Processor Graphics including
-Intel® HD Graphics, Intel® Iris® Graphics, Intel® Iris® Xe Graphics, and Intel® Iris® Xe MAX graphics.
+The [GPU plugin](https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) uses [oneDNN](https://github.com/oneapi-src/oneDNN) and [OpenCL](https://github.com/KhronosGroup/OpenCL-SDK) to infer deep neural networks. For inference execution, it employs Intel® Processor Graphics including
+Intel® Arc™ GPU Series, Intel® UHD Graphics, Intel® HD Graphics, Intel® Iris® Graphics, Intel® Iris® Xe Graphics, Intel® Iris® Xe MAX graphics, and Intel® Data Center GPU.
 
 Before using GPU as OpenVINO Model Server target device, you need to:
 
@@ -30,7 +29,7 @@ Running inference on GPU requires the model server process security context acco
 stat -c "group_name=%G group_id=%g" /dev/dri/render*
 ```
 
-The default account in the docker image is preconfigured. If you change the security context, use the following command to start the model server container:
+Use the following command to start the model server container:
 
 ```bash
 docker run --rm -it --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
@@ -48,27 +47,16 @@ docker run --rm -it --device=/dev/dxg --volume /usr/lib/wsl:/usr/lib/wsl -u $(i
 ```
 
-
 ## Using NPU device Plugin
 
-OpenVINO Model Server can support using [NPU device](https://docs.openvino.ai/canonical/openvino_docs_install_guides_configurations_for_intel_npu.html)
+OpenVINO Model Server can support using [NPU device](https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.html)
 
-Docker image with required dependencies can be build using this procedure:
-The docker image of OpenVINO Model Server including support for NVIDIA can be built from sources
-
-```bash
-git clone https://github.com/openvinotoolkit/model_server.git
-cd model_server
-make release_image NPU=1
-cd ..
-```
-
 Example command to run container with NPU:
 ```bash
 docker run --device /dev/accel -p 9000:9000 --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
--v ${PWD}/models/public/resnet-50-tf:/opt/model openvino/model_server:latest --model_path /opt/model --model_name resnet --port 9000 --target_device NPU
+-v ${PWD}/models/public/resnet-50-tf:/opt/model openvino/model_server:latest-gpu --model_path /opt/model --model_name resnet --port 9000 --target_device NPU
 ```
-Check more info about the [NPU driver for Linux](https://github.com/intel/linux-npu-driver).
+Check more info about the [NPU driver configuration](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-npu.html).
 
 
 
@@ -161,8 +149,11 @@ With Automatic Batching, gathering the input and scattering the output from the
 > **NOTE**: Autobatching can be applied only for static models
 
 ```bash
-docker run -v ${PWD}/models/public/resnet-50-tf:/opt/model -p 9001:9001 openvino/model_server:latest-gpu \
+docker run -v ${PWD}/models/public/resnet-50-tf:/opt/model -p 9001:9001 openvino/model_server:latest \
 --model_path /opt/model --model_name resnet --port 9001 \
 --plugin_config '{"AUTO_BATCH_TIMEOUT": 200}' \
 --target_device BATCH:CPU(16)
-``` \ No newline at end of file
+```
+
+In the example above, there is a 200ms timeout to wait for filling the batch size up to 16.
+Note that autobatching is enabled by default when the `target_device` is set to `GPU` with `--plugin_config '{"PERFORMANCE_HINT": "THROUGHPUT"}'`.
\ No newline at end of file

From c1541c1d503b25c4584c6471c7d082adfe0cfc84 Mon Sep 17 00:00:00 2001
From: "Trawinski, Dariusz"
Date: Tue, 14 Jan 2025 11:08:19 +0100
Subject: [PATCH 4/4] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Miłosz Żeglarski
---
 docs/accelerators.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/accelerators.md b/docs/accelerators.md
index f4edfadf61..bb7a0f135d 100644
--- a/docs/accelerators.md
+++ b/docs/accelerators.md
@@ -49,7 +49,7 @@ docker run --rm -it --device=/dev/dxg --volume /usr/lib/wsl:/usr/lib/wsl -u $(i
 
 ## Using NPU device Plugin
 
-OpenVINO Model Server can support using [NPU device](https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.html)
+OpenVINO Model Server supports using [NPU device](https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.html)
 
 Example command to run container with NPU:
@@ -101,7 +101,7 @@ docker run --rm -d --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render*
 
 The `Auto Device` plugin can also use the [PERFORMANCE_HINT](performance_tuning.md) plugin config property that enables you to specify a performance mode for the plugin.
 While LATENCY and THROUGHPUT hints can select one target device with your preferred performance option, the CUMULATIVE_THROUGHPUT option enables running inference on multiple devices for higher throughput.
-With CUMULATIVE_THROUGHPUT, AUTO loads the network model to all available devices (specified by AUTO) in the candidate list, and then runs inference on them based on the default or specified priority.
+With CUMULATIVE_THROUGHPUT hint, AUTO plugin loads the network model to all available devices (specified by the plugin) in the candidate list, and then runs inference on them based on the default or specified priority.
 
 > **NOTE**: NUM_STREAMS and PERFORMANCE_HINT should not be used together.
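For reference, the device-name handling that [PATCH 1/4] adds to `validatePluginConfiguration` can be reproduced in isolation: a virtual-plugin target such as `BATCH:CPU(4)` carries a bracketed per-device setting that has to be stripped before the plugin's supported configuration keys are looked up by device name. The sketch below is a minimal standalone illustration rather than server code: `parseDeviceNames`, the prefix handling, and the `main()` driver are assumptions made for this example, while the bracket-stripping loop mirrors the lines added in the diff above.

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Standalone sketch of the device-name normalization introduced in PATCH 1/4.
// parseDeviceNames() is an illustrative helper for this example only; the
// bracket-stripping loop follows the logic added to validatePluginConfiguration()
// in src/ov_utils.cpp.
static std::vector<std::string> parseDeviceNames(const std::string& targetDevice) {
    // Drop a virtual-plugin prefix such as "BATCH:" or "HETERO:" if present.
    // When no ':' exists, find() returns npos and npos + 1 wraps to 0, so a
    // plain "CPU" is kept whole.
    std::string deviceList = targetDevice.substr(targetDevice.find(':') + 1);
    std::stringstream ss(deviceList);
    std::vector<std::string> devices;
    std::string deviceName;
    const char deviceDelimiter = ',';
    while (std::getline(ss, deviceName, deviceDelimiter)) {
        // Strip a per-device setting in brackets, e.g. "CPU(4)" -> "CPU",
        // so the plugin's supported config keys can be queried by plain name.
        char bracket = '(';
        auto bracketPos = deviceName.find(bracket);
        if (bracketPos != std::string::npos) {
            deviceName = deviceName.substr(0, bracketPos);
        }
        devices.push_back(deviceName);
    }
    return devices;
}

int main() {
    for (const std::string& device : parseDeviceNames("BATCH:CPU(4)")) {
        std::cout << device << std::endl;  // prints "CPU"
    }
    for (const std::string& device : parseDeviceNames("HETERO:GPU,CPU")) {
        std::cout << device << std::endl;  // prints "GPU" then "CPU"
    }
    return 0;
}
```

Compiled as-is, the sketch prints `CPU` for `BATCH:CPU(4)` and `GPU`, `CPU` for `HETERO:GPU,CPU`, which matches the device names the patch uses when querying supported plugin configuration keys, as exercised by the `ValidatePluginConfigurationPositiveBatch` test.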