diff --git a/src/main/cpp/profiler/CMakeLists.txt b/src/main/cpp/profiler/CMakeLists.txt
index 03a552b3ea..915faca486 100644
--- a/src/main/cpp/profiler/CMakeLists.txt
+++ b/src/main/cpp/profiler/CMakeLists.txt
@@ -77,6 +77,11 @@ configure_file(
 
 add_executable(spark_rapids_profile_converter
   spark_rapids_profile_converter.cpp
+  initialize_nvtxw.cpp
+  nvtxw3.cpp
+  nvtxw3.h
+  NvtxwEvents.cpp
+  NvtxwEvents.h
   "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/profiler_schema.cpp"
   "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/spark_rapids_jni_version.cpp"
   "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}/profiler_generated.h"
@@ -86,6 +91,8 @@ target_include_directories(
   spark_rapids_profile_converter
   PRIVATE
   "${CUDAToolkit_INCLUDE_DIRS}"
+  "${SPARK_RAPIDS_JNI_SOURCE_DIR}"
+  "${SPARK_RAPIDS_JNI_SOURCE_DIR}/profiler"
   "${SPARK_RAPIDS_JNI_SOURCE_DIR}/src"
   "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}"
 )
diff --git a/src/main/cpp/profiler/NvtxwEvents.cpp b/src/main/cpp/profiler/NvtxwEvents.cpp
new file mode 100644
index 0000000000..5cd0873d00
--- /dev/null
+++ b/src/main/cpp/profiler/NvtxwEvents.cpp
@@ -0,0 +1,413 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.txt for license information.
+ */
+
+#include <type_traits>
+
+#include "NvtxwEvents.h"
+
+namespace NvidiaNvtxw
+{
+
+#define PAYLOAD_ENTRY_SIMPLE(flags, type, name) \
+    { (flags), (type), (name), nullptr, 0, 0, nullptr, nullptr } /* Entry initializer: only flags, type and name vary; the other five members are null/zero. */
+
+// The event name is carried by its own dynamic-size schema: a single zero-terminated C string entry flagged as the event message.
+static const nvtxPayloadSchemaEntry_t nameSchema[] = {
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE | NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED,
+        NVTX_PAYLOAD_ENTRY_TYPE_CSTRING,
+        "name"
+    )
+};
+static const nvtxPayloadSchemaAttr_t nameSchemaAttr{ // Members are positional; each /*.member = */ marker names the value that follows it.
+        /*.fieldMask = */
+        NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
+        NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |
+        NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES |
+        NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
+        NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
+        /*.name = */
+        nullptr, // no name: NVTX_PAYLOAD_SCHEMA_ATTR_NAME is not part of the field mask above
+        /*.type = */
+        NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC,
+        /*.flags = */
+        NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED,
+        /*.entries = */
+        nameSchema,
+        /*.numEntries = */
+        std::extent<decltype(nameSchema)>::value, // element count taken from the array type itself
+        /*.payloadStaticSize = */
+        0, // NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE is not part of the field mask above
+        /*.packAlign = */
+        0,
+        /*.schemaId = */
+        NvidiaNvtxw::PayloadSchemaId::nameId,
+        /*.extension = */
+        nullptr
+    };
+
+static const nvtxPayloadSchemaEntry_t nvtxRangeSchema[] = { // Entry list shared by the push/pop and start/end range schemas defined below.
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_start"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_stop"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE | NVTX_PAYLOAD_ENTRY_FLAG_POINTER,
+        NVTX_PAYLOAD_ENTRY_TYPE_CSTRING,
+        "name"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32, "thread_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_COLOR_ARGB, "color"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "push_pop") // NOTE(review): struct nvtxRangeEvent (NvtxwEvents.h) declares no push_pop member - confirm against Nsight Systems.
+};
+// TimeBase = Relative
+static const nvtxPayloadSchemaAttr_t nvtxRangePushPopSchemaAttr = { // Positional members: fieldMask, name, type, flags, entries, numEntries, payloadStaticSize, packAlign, schemaId, extension.
+    NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
+    NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
+    NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
+    "NVTX Range Push Pop Event",
+    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
+    NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
+    nvtxRangeSchema,
+    std::extent<decltype(nvtxRangeSchema)>::value,
+    sizeof(struct NvidiaNvtxw::nvtxRangeEvent), // static payload size is the size of the C++ record
+    0,
+    NvidiaNvtxw::PayloadSchemaId::nvtxRangePushPopId,
+    nullptr
+};
+// TimeBase = Relative
+static const nvtxPayloadSchemaAttr_t nvtxRangeStartEndSchemaAttr = { // Identical to the push/pop attributes except for the schema name and schemaId.
+    NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
+    NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
+    NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
+    "NVTX Range Start End Event",
+    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
+    NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
+    nvtxRangeSchema,
+    std::extent<decltype(nvtxRangeSchema)>::value,
+    sizeof(struct NvidiaNvtxw::nvtxRangeEvent),
+    0,
+    NvidiaNvtxw::PayloadSchemaId::nvtxRangeStartEndId,
+    nullptr
+};
+
+static const nvtxPayloadSchemaEntry_t cuptiApiSchema[] = { // One entry per member of struct cuptiApiEvent (NvtxwEvents.h), in declaration order.
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_start"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_stop"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "kind"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "cbid"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32, "thread_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "correlation_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "return_value")
+};
+static const nvtxPayloadSchemaAttr_t cuptiApiSchemaAttr = { // Positional members, in the order spelled out by the markers in nameSchemaAttr.
+    NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
+    NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
+    NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
+    "CUPTI API Activity",
+    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
+    NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
+    cuptiApiSchema,
+    std::extent<decltype(cuptiApiSchema)>::value,
+    sizeof(struct NvidiaNvtxw::cuptiApiEvent), // static payload size is the size of the C++ record
+    0,
+    NvidiaNvtxw::PayloadSchemaId::cuptiApiId,
+    nullptr
+};
+static const nvtxPayloadSchemaEntry_t cuptiDeviceSchema[] = { // One entry per member of struct cuptiDevice (NvtxwEvents.h), in declaration order; no timestamp entries.
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "global_memory_bandwidth"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "global_memory_size"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "constant_memory_size"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "l2_cache_size"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "num_threads_per_warp"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "core_clock_rate"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "num_memcpy_engines"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "num_multiprocessors"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_ipc"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_warps_per_multiprocessor"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_blocks_per_multiprocessor"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_shared_memory_per_multiprocessor"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_registers_per_multiprocessor"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_registers_per_block"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_shared_memory_per_block"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_threads_per_block"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_block_dim_x"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_block_dim_y"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_block_dim_z"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_grid_dim_x"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_grid_dim_y"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_grid_dim_z"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "compute_capability_major"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "compute_capability_minor"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "ecc_enabled"),
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE | NVTX_PAYLOAD_ENTRY_FLAG_POINTER,
+        NVTX_PAYLOAD_ENTRY_TYPE_CSTRING,
+        "name"
+    )   
+};
+static const nvtxPayloadSchemaAttr_t cuptiDeviceSchemaAttr = { // Positional members, in the order spelled out by the markers in nameSchemaAttr.
+    NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
+    NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
+    NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
+    "CUPTI Device",
+    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
+    NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
+    cuptiDeviceSchema,
+    std::extent<decltype(cuptiDeviceSchema)>::value,
+    sizeof(struct NvidiaNvtxw::cuptiDevice), // static payload size is the size of the C++ record
+    0,
+    NvidiaNvtxw::PayloadSchemaId::cuptiDeviceId,
+    nullptr
+};
+static const nvtxPayloadSchemaEntry_t cuptiKernelSchema[] = { // One entry per member of struct cuptiKernelEvent (NvtxwEvents.h), in declaration order.
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_start"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_stop"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "completed"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "grid_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "queued"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "submitted"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "graph_node_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "local_memory_total_v2"),
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE | NVTX_PAYLOAD_ENTRY_FLAG_POINTER,
+        NVTX_PAYLOAD_ENTRY_TYPE_CSTRING,
+        "name"
+    ),    
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "device_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "context_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "stream_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "grid_x"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "grid_y"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "grid_z"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "block_x"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "block_y"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "block_z"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "static_shared_memory"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "dynamic_shared_memory"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "local_memory_per_thread"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "local_memory_total"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "correlation_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "shared_memory_executed"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "graph_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "channel_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "cluster_x"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "cluster_y"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "cluster_z"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "cluster_scheduling_policy"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT16, "registers_per_thread"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "requested"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "executed"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "shared_memory_config"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "partitioned_global_cache_requested"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "partitioned_global_cache_executed"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "launch_type"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "is_shared_memory_carveout_requested"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "shared_memory_carveout_requested"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "shmem_limit_config"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "channel_type")
+};
+static const nvtxPayloadSchemaAttr_t cuptiKernelSchemaAttr = { // Positional members, in the order spelled out by the markers in nameSchemaAttr.
+    NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
+    NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
+    NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
+    "CUPTI Kernel",
+    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
+    NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
+    cuptiKernelSchema,
+    std::extent<decltype(cuptiKernelSchema)>::value,
+    sizeof(struct NvidiaNvtxw::cuptiKernelEvent), // static payload size is the size of the C++ record
+    0,
+    NvidiaNvtxw::PayloadSchemaId::cuptiKernelId,
+    nullptr
+};
+static const nvtxPayloadSchemaEntry_t cuptiMemcpySchema[] = { // One entry per member of struct cuptiMemcpyEvent (NvtxwEvents.h), in declaration order.
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_start"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_stop"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "bytes"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "graph_node_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "device_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "context_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "stream_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "correlation_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "runtime_correlation_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "graph_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "channel_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "channel_type"), // the matching struct member is spelled channelType
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "copy_kind"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "src_kind"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "dst_kind")
+};
+static const nvtxPayloadSchemaAttr_t cuptiMemcpySchemaAttr = { // Positional members, in the order spelled out by the markers in nameSchemaAttr.
+    NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
+    NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
+    NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
+    "CUPTI Memcpy",
+    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
+    NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
+    cuptiMemcpySchema,
+    std::extent<decltype(cuptiMemcpySchema)>::value,
+    sizeof(struct NvidiaNvtxw::cuptiMemcpyEvent), // static payload size is the size of the C++ record
+    0,
+    NvidiaNvtxw::PayloadSchemaId::cuptiMemcpyId,
+    nullptr
+};
+static const nvtxPayloadSchemaEntry_t cuptiMemsetSchema[] = { // One entry per member of struct cuptiMemsetEvent (NvtxwEvents.h), in declaration order.
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_start"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_stop"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "bytes"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "graph_node_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "device_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "context_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "stream_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "correlation_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "graph_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "channel_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "value"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "channel_type"), // the matching struct member is spelled channelType
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "mem_kind"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "flags")
+};
+static const nvtxPayloadSchemaAttr_t cuptiMemsetSchemaAttr = { // Positional members, in the order spelled out by the markers in nameSchemaAttr.
+    NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
+    NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
+    NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
+    "CUPTI Memset",
+    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
+    NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
+    cuptiMemsetSchema,
+    std::extent<decltype(cuptiMemsetSchema)>::value,
+    sizeof(struct NvidiaNvtxw::cuptiMemsetEvent), // static payload size is the size of the C++ record
+    0,
+    NvidiaNvtxw::PayloadSchemaId::cuptiMemsetId,
+    nullptr
+};
+static const nvtxPayloadSchemaEntry_t cuptiOverheadSchema[] = { // One entry per member of struct cuptiOverheadEvent (NvtxwEvents.h), in declaration order.
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_start"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(
+        NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP,
+        NVTX_PAYLOAD_ENTRY_TYPE_UINT64,
+        "time_stop"
+    ),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32, "thread_id"),
+    PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "overhead_kind"),
+};
+static const nvtxPayloadSchemaAttr_t cuptiOverheadSchemaAttr = { // Positional members, in the order spelled out by the markers in nameSchemaAttr.
+    NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
+    NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
+    NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
+    "CUPTI Overhead",
+    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
+    NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
+    cuptiOverheadSchema,
+    std::extent<decltype(cuptiOverheadSchema)>::value,
+    sizeof(struct NvidiaNvtxw::cuptiOverheadEvent), // static payload size is the size of the C++ record
+    0,
+    NvidiaNvtxw::PayloadSchemaId::cuptiOverheadId,
+    nullptr
+};
+#undef PAYLOAD_ENTRY_SIMPLE
+
+const nvtxPayloadSchemaAttr_t* GetNameSchemaAttr() // Event-name schema (schemaId PayloadSchemaId::nameId).
+{
+    return &nameSchemaAttr; // address of a file-static object, so the pointer stays valid for the whole program
+}
+const nvtxPayloadSchemaAttr_t* GetNvtxRangePushPopSchemaAttr() // "NVTX Range Push Pop Event" schema.
+{
+    return &nvtxRangePushPopSchemaAttr;
+}
+const nvtxPayloadSchemaAttr_t* GetNvtxRangeStartEndSchemaAttr() // "NVTX Range Start End Event" schema.
+{
+    return &nvtxRangeStartEndSchemaAttr;
+}
+const nvtxPayloadSchemaAttr_t* GetCuptiApiSchemaAttr() // "CUPTI API Activity" schema.
+{
+    return &cuptiApiSchemaAttr;
+}
+const nvtxPayloadSchemaAttr_t* GetCuptiDeviceSchemaAttr() // "CUPTI Device" schema.
+{
+    return &cuptiDeviceSchemaAttr;
+}
+const nvtxPayloadSchemaAttr_t* GetCuptiKernelSchemaAttr() // "CUPTI Kernel" schema.
+{
+    return &cuptiKernelSchemaAttr;
+}
+const nvtxPayloadSchemaAttr_t* GetCuptiMemcpySchemaAttr() // "CUPTI Memcpy" schema.
+{
+    return &cuptiMemcpySchemaAttr;
+}
+const nvtxPayloadSchemaAttr_t* GetCuptiMemsetSchemaAttr() // "CUPTI Memset" schema.
+{
+    return &cuptiMemsetSchemaAttr;
+}
+const nvtxPayloadSchemaAttr_t* GetCuptiOverheadSchemaAttr() // "CUPTI Overhead" schema.
+{
+    return &cuptiOverheadSchemaAttr;
+}
+}
\ No newline at end of file
diff --git a/src/main/cpp/profiler/NvtxwEvents.h b/src/main/cpp/profiler/NvtxwEvents.h
new file mode 100644
index 0000000000..6c6bff8304
--- /dev/null
+++ b/src/main/cpp/profiler/NvtxwEvents.h
@@ -0,0 +1,188 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.txt for license information.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <nvtx3/nvToolsExtPayload.h>
+
+namespace NvidiaNvtxw 
+{
+
+namespace PayloadSchemaId // Fixed numeric schema IDs; NvtxwEvents.cpp uses each one as the schemaId of exactly one schema attribute.
+{
+    static constexpr uint64_t nameId              = 0xffffff00;
+    static constexpr uint64_t nvtxRangePushPopId  = 0xffffff01;
+    static constexpr uint64_t cuptiApiId          = 0xffffff02;
+    static constexpr uint64_t cuptiMemcpyId       = 0xffffff03;
+    static constexpr uint64_t cuptiMemsetId       = 0xffffff04;
+    static constexpr uint64_t cuptiDeviceId       = 0xffffff05;
+    static constexpr uint64_t cuptiKernelId       = 0xffffff06;
+    static constexpr uint64_t cuptiOverheadId     = 0xffffff07;
+    static constexpr uint64_t nvtxRangeStartEndId = 0xffffff08; // numbered last, out of step with the declaration order used elsewhere
+};
+
+const nvtxPayloadSchemaAttr_t* GetNameSchemaAttr(); // Event-name schema (schemaId PayloadSchemaId::nameId).
+
+struct nvtxRangeEvent { // Payload record whose size both range schemas report; entries are listed in nvtxRangeSchema (NvtxwEvents.cpp).
+    uint64_t time_start;
+    uint64_t time_stop;
+    const char* name;
+    uint32_t process_id;
+    uint32_t thread_id;
+    uint32_t color; // NOTE(review): nvtxRangeSchema lists one more UINT8 entry, "push_pop", with no member here - confirm against Nsight Systems.
+};
+const nvtxPayloadSchemaAttr_t* GetNvtxRangePushPopSchemaAttr(); // "NVTX Range Push Pop Event" schema.
+const nvtxPayloadSchemaAttr_t* GetNvtxRangeStartEndSchemaAttr(); // "NVTX Range Start End Event" schema.
+struct cuptiApiEvent { // Payload record for the "CUPTI API Activity" schema; members follow the cuptiApiSchema entry order (NvtxwEvents.cpp).
+    uint64_t time_start;
+    uint64_t time_stop;
+    uint32_t kind;
+    uint32_t cbid;
+    uint32_t process_id;
+    uint32_t thread_id;
+    uint32_t correlation_id;
+    uint32_t return_value;
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiApiSchemaAttr(); // Schema whose schemaId is PayloadSchemaId::cuptiApiId.
+struct cuptiDevice { // Payload record for the "CUPTI Device" schema; members follow the cuptiDeviceSchema entry order (NvtxwEvents.cpp).
+    uint64_t global_memory_bandwidth;
+    uint64_t global_memory_size;
+    uint32_t constant_memory_size;
+    uint32_t l2_cache_size;
+    uint32_t num_threads_per_warp;
+    uint32_t core_clock_rate;
+    uint32_t num_memcpy_engines;
+    uint32_t num_multiprocessors;
+    uint32_t max_ipc;
+    uint32_t max_warps_per_multiprocessor;
+    uint32_t max_blocks_per_multiprocessor;
+    uint32_t max_shared_memory_per_multiprocessor;
+    uint32_t max_registers_per_multiprocessor;
+    uint32_t max_registers_per_block;
+    uint32_t max_shared_memory_per_block;
+    uint32_t max_threads_per_block;
+    uint32_t max_block_dim_x;
+    uint32_t max_block_dim_y;
+    uint32_t max_block_dim_z;
+    uint32_t max_grid_dim_x;
+    uint32_t max_grid_dim_y;
+    uint32_t max_grid_dim_z;
+    uint32_t compute_capability_major;
+    uint32_t compute_capability_minor;
+    uint32_t id;
+    uint32_t ecc_enabled;
+    const char* name; // described in the schema as a pointer to a C string that serves as the event message
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiDeviceSchemaAttr(); // Schema whose schemaId is PayloadSchemaId::cuptiDeviceId.
+struct cuptiKernelEvent { // Payload record for the "CUPTI Kernel" schema; members follow the cuptiKernelSchema entry order (NvtxwEvents.cpp).
+    uint64_t time_start;
+    uint64_t time_stop;
+    uint64_t completed;
+    uint64_t grid_id;
+    uint64_t queued;
+    uint64_t submitted;
+    uint64_t graph_node_id;
+    uint64_t local_memory_total_v2;
+    const char * name; // described in the schema as a pointer to a C string that serves as the event message
+    uint32_t device_id;
+    uint32_t context_id;
+    uint32_t stream_id;
+    uint32_t process_id;
+    uint32_t grid_x;
+    uint32_t grid_y;
+    uint32_t grid_z;
+    uint32_t block_x;
+    uint32_t block_y;
+    uint32_t block_z;
+    uint32_t static_shared_memory;
+    uint32_t dynamic_shared_memory;
+    uint32_t local_memory_per_thread;
+    uint32_t local_memory_total;
+    uint32_t correlation_id;
+    uint32_t shared_memory_executed;
+    uint32_t graph_id;
+    uint32_t channel_id;
+    uint32_t cluster_x;
+    uint32_t cluster_y;
+    uint32_t cluster_z;
+    uint32_t cluster_scheduling_policy;
+    uint16_t registers_per_thread;
+    uint8_t requested;
+    uint8_t executed;
+    uint8_t shared_memory_config;
+    uint8_t partitioned_global_cache_requested;
+    uint8_t partitioned_global_cache_executed;
+    uint8_t launch_type;
+    uint8_t is_shared_memory_carveout_requested;
+    uint8_t shared_memory_carveout_requested;
+    uint8_t shmem_limit_config;
+    uint8_t channel_type;
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiKernelSchemaAttr(); // Schema whose schemaId is PayloadSchemaId::cuptiKernelId.
+
+struct cuptiMemcpyEvent { // Payload record for the "CUPTI Memcpy" schema; members follow the cuptiMemcpySchema entry order (NvtxwEvents.cpp).
+    uint64_t time_start;
+    uint64_t time_stop;
+    uint64_t bytes;
+    uint64_t graph_node_id;
+    uint32_t device_id;
+    uint32_t context_id;
+    uint32_t stream_id;
+    uint32_t process_id;
+    uint32_t correlation_id;
+    uint32_t runtime_correlation_id;
+    uint32_t graph_id;
+    uint32_t channel_id;
+    uint8_t channelType; // the schema entry for this member is named "channel_type"
+    uint8_t copy_kind;
+    uint8_t src_kind;
+    uint8_t dst_kind;
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiMemcpySchemaAttr(); // Schema whose schemaId is PayloadSchemaId::cuptiMemcpyId.
+
+struct cuptiMemsetEvent { // Payload record for the "CUPTI Memset" schema; members follow the cuptiMemsetSchema entry order (NvtxwEvents.cpp).
+    uint64_t time_start;
+    uint64_t time_stop;
+    uint64_t bytes;
+    uint64_t graph_node_id;    
+    uint32_t device_id;
+    uint32_t context_id;
+    uint32_t stream_id;
+    uint32_t process_id;
+    uint32_t correlation_id;
+    uint32_t graph_id;
+    uint32_t channel_id;
+    uint32_t value;
+    uint8_t channelType; // the schema entry for this member is named "channel_type"
+    uint8_t mem_kind;
+    uint8_t flags;
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiMemsetSchemaAttr(); // Schema whose schemaId is PayloadSchemaId::cuptiMemsetId.
+struct cuptiOverheadEvent { // Payload record for the "CUPTI Overhead" schema; members follow the cuptiOverheadSchema entry order (NvtxwEvents.cpp).
+    uint64_t time_start;
+    uint64_t time_stop;
+    uint32_t process_id;
+    uint32_t thread_id;
+    uint8_t overhead_kind;
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiOverheadSchemaAttr(); // Schema whose schemaId is PayloadSchemaId::cuptiOverheadId.
+
+}
diff --git a/src/main/cpp/profiler/README-nvtxw.txt b/src/main/cpp/profiler/README-nvtxw.txt
new file mode 100644
index 0000000000..87d6f37406
--- /dev/null
+++ b/src/main/cpp/profiler/README-nvtxw.txt
@@ -0,0 +1,22 @@
+1. NvtxwEvents.h and NvtxwEvents.cpp are copied from the Nsight Systems source code. They must be kept in sync between this project and Nsight Systems.
+
+2. Set the NVTXW_BACKEND environment variable to the path of the libNvtxwBackend.so library, which is found in the host directory of a current build of Nsight Systems. For example:
+   > export NVTXW_BACKEND=/opt/nvidia/nsight-systems/2024.6.0/host-linux-x64/libNvtxwBackend.so
+
+3. Run like this:
+      > ./target/jni/cmake-build/profiler/spark_rapids_profile_converter  -w -o file3021460.nsys-rep rapids-profile-3021460@jlowe-lcedt-driver.bin
+   and get output similar to this:
+      Backend implementation loaded!  Applying config string...
+      Loader config key/value pairs not provided
+      Creating report: "file3021460.nsys-rep"
+      - Created session: file3021460
+      Session config key/value pairs not provided
+      - Created stream: Stream1
+         Domain: SparkRAPIDS
+         Scope: 
+      - Destroyed stream: Stream1
+      3946 events imported
+      - Destroyed session: file3021460
+      Backend implementation prepared for unload.
+   
+4. Load the report into the Nsight Systems UI: nsys-ui file3021460.nsys-rep
\ No newline at end of file
diff --git a/src/main/cpp/profiler/initialize_nvtxw.cpp b/src/main/cpp/profiler/initialize_nvtxw.cpp
new file mode 100644
index 0000000000..218b6c9785
--- /dev/null
+++ b/src/main/cpp/profiler/initialize_nvtxw.cpp
@@ -0,0 +1,202 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.txt for license information.
+ */
+
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cxxabi.h>
+#include <charconv>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "nvtxw_events.h"
+
+/// Opens a stream called `name` in `domain` on `session` and stores its handle in `stream`.
+/// Returns true on success; on failure the reason is printed to stderr and false is returned.
+bool createNvtxwStream(const nvtxwInterfaceCore_t* nvtxwInterface,
+                       const nvtxwSessionHandle_t& session,
+                       const std::string& name,
+                       const std::string& domain,
+                       nvtxwStreamHandle_t& stream)
+{
+  // Positional initializer: structure size, stream name, domain, an empty string, the three
+  // NONE/UNKNOWN interleaving/ordering/skid constants and a trailing zero.
+  nvtxwStreamAttributes_t attributes = {
+    sizeof(nvtxwStreamAttributes_t),
+    name.c_str(),
+    domain.c_str(),
+    "",
+    NVTXW3_STREAM_ORDER_INTERLEAVING_NONE,
+    NVTXW3_STREAM_ORDERING_TYPE_UNKNOWN,
+    NVTXW3_STREAM_ORDERING_SKID_NONE,
+    0
+  };
+  const nvtxwResultCode_t status = nvtxwInterface->StreamOpen(&stream, session, &attributes);
+  if (status != NVTXW3_RESULT_SUCCESS) {
+    fprintf(stderr, "StreamOpen failed with code %d\n", static_cast<int>(status));
+    return false;
+  }
+  // A call that reports success but leaves the handle null is treated as a failure as well.
+  const bool missingHandle = !stream.opaque;
+  if (missingHandle) { fprintf(stderr, "StreamOpen returned null stream handle!\n"); }
+  return !missingHandle;
+}
+
+/// Loads the NVTXW backend, begins a session and opens the "CUPTI" stream with its schemas.
+///
+/// The backend library is taken from the NVTXW_BACKEND environment variable when it is set to
+/// a non-empty value; otherwise "libNvtxwBackend.so" is used.
+///
+/// @param in                 Input profile stream; not read by this function.
+/// @param outName            Basename of the output nsys-rep, without the .nsys-rep extension.
+/// @param nvtxwModuleHandle  Receives the module handle produced by nvtxwInitialize.
+/// @param nvtxwInterface     Receives the NVTXW3_INTERFACE_ID_CORE_V1 interface.
+/// @param session            Receives the session begun for outName.
+/// @param stream             Receives the stream opened with name and domain "CUPTI".
+/// @return 0 on success. 1 when loading the backend, fetching the interface, beginning the
+///         session or opening the stream fails; the function returns immediately in that case.
+///         2 when those steps succeed but at least one schema fails to register; the remaining
+///         schemas are still attempted. Every failure is also reported on stderr.
+///
+/// No cleanup is performed on failure: whatever was already stored in the out-parameters is
+/// left there for the caller.
+int initialize_nvtxw(std::ifstream& in, const std::string& outName,
+  void *& nvtxwModuleHandle,
+  nvtxwInterfaceCore_t *&nvtxwInterface,
+  nvtxwSessionHandle_t &session,
+  nvtxwStreamHandle_t &stream) {
+  (void)in;  // unused; the parameter is kept so existing callers keep compiling
+  nvtxwResultCode_t result = NVTXW3_RESULT_SUCCESS;
+
+  // initialize
+  static const char soNameDefault[] = "libNvtxwBackend.so";
+  const char *soName = soNameDefault;
+  const char *backend_env = getenv("NVTXW_BACKEND");
+  if (backend_env && backend_env[0] != '\0')  // an empty value would name no library at all
+  {
+    soName = backend_env;
+  }
+  nvtxwGetInterface_t getInterfaceFunc = nullptr;
+  result = nvtxwInitialize(
+      NVTXW3_INIT_MODE_LIBRARY_FILENAME,
+      soName,
+      &getInterfaceFunc,
+      &nvtxwModuleHandle);
+  if (result != NVTXW3_RESULT_SUCCESS)
+  {
+    fprintf(stderr, "nvtxwInitialize failed with code %d\n", static_cast<int>(result));
+    if (result == NVTXW3_RESULT_LIBRARY_NOT_FOUND)
+    {
+      fprintf(stderr, "Failed to find %s\n", soName);
+    }
+    return 1;
+  }
+  if (!getInterfaceFunc)
+  {
+    fprintf(stderr, "nvtxwInitialize returned null nvtxwGetInterface_t!\n");
+    return 1;
+  }
+
+  // Start from null so the check below is meaningful even if the backend reports success
+  // without writing the pointer.
+  const void* interfaceVoid = nullptr;
+  result = getInterfaceFunc(
+      NVTXW3_INTERFACE_ID_CORE_V1,
+      &interfaceVoid);
+  if (result != NVTXW3_RESULT_SUCCESS)
+  {
+    fprintf(stderr, "getInterfaceFunc failed with code %d\n", static_cast<int>(result));
+    return 1;
+  }
+  if (!interfaceVoid)
+  {
+    fprintf(stderr, "getInterfaceFunc returned null nvtxwInterface pointer!\n");
+    return 1;
+  }
+  // The getter hands back a const void*; the out-parameter is a non-const interface pointer.
+  nvtxwInterface = static_cast<nvtxwInterfaceCore_t*>(const_cast<void*>(interfaceVoid));
+
+  // session begin (no session configuration string is supplied)
+  nvtxwSessionAttributes_t sessionAttr = {
+      sizeof(nvtxwSessionAttributes_t),
+      outName.c_str(),
+      nullptr
+  };
+  result = nvtxwInterface->SessionBegin(&session, &sessionAttr);
+  if (result != NVTXW3_RESULT_SUCCESS)
+  {
+    fprintf(stderr, "SessionBegin failed with code %d\n", static_cast<int>(result));
+    return 1;
+  }
+  if (!session.opaque)
+  {
+    fprintf(stderr, "SessionBegin returned null session handle!\n");
+    return 1;
+  }
+
+  // stream open
+  const std::string streamName("CUPTI");
+  const std::string domainName("CUPTI");
+  if (!createNvtxwStream(nvtxwInterface, session, streamName, domainName, stream))
+  {
+    return 1;
+  }
+
+  // schema register: every schema is attempted so that all failures get reported
+  struct SchemaToRegister {
+    const char* label;                            // name used in the error message
+    const nvtxPayloadSchemaAttr_t* (*getAttr)();  // accessor from NvtxwEvents.h
+  };
+  static const SchemaToRegister schemas[] = {
+    {"nameSchema", &NvidiaNvtxw::GetNameSchemaAttr},
+    {"nvtxRangePushPopSchema", &NvidiaNvtxw::GetNvtxRangePushPopSchemaAttr},
+    // The start/end range schema shares its entry list with push/pop and is registered with it.
+    // NOTE(review): confirm no other code registers it on this stream as well.
+    {"nvtxRangeStartEndSchema", &NvidiaNvtxw::GetNvtxRangeStartEndSchemaAttr},
+    {"cuptiApiSchema", &NvidiaNvtxw::GetCuptiApiSchemaAttr},
+    {"cuptiDeviceSchema", &NvidiaNvtxw::GetCuptiDeviceSchemaAttr},
+    {"cuptiKernelSchema", &NvidiaNvtxw::GetCuptiKernelSchemaAttr},
+    {"cuptiMemcpySchema", &NvidiaNvtxw::GetCuptiMemcpySchemaAttr},
+    {"cuptiMemsetSchema", &NvidiaNvtxw::GetCuptiMemsetSchemaAttr},
+    {"cuptiOverheadSchema", &NvidiaNvtxw::GetCuptiOverheadSchemaAttr},
+  };
+
+  int errorCode = 0;
+  for (const SchemaToRegister& schema : schemas)
+  {
+    result = nvtxwInterface->SchemaRegister(stream, schema.getAttr());
+    if (result != NVTXW3_RESULT_SUCCESS)
+    {
+      fprintf(stderr, "SchemaRegister failed for '%s' with code %d\n", schema.label, static_cast<int>(result));
+      errorCode |= 2;
+    }
+  }
+  return errorCode;
+}
diff --git a/src/main/cpp/profiler/nvtx3/nvToolsExtPayload.h b/src/main/cpp/profiler/nvtx3/nvToolsExtPayload.h
new file mode 100644
index 0000000000..3c750f7b13
--- /dev/null
+++ b/src/main/cpp/profiler/nvtx3/nvToolsExtPayload.h
@@ -0,0 +1,1173 @@
+/*
+* Copyright 2021-2024  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvtx3/nvToolsExt.h"
+
+/* Optionally include helper macros. */
+/* #include "nvToolsExtPayloadHelper.h" */
+
+/**
+ * If needed, semantic extension headers can be included after this header.
+ */
+
+/**
+ * \brief The compatibility ID is used for versioning of this extension.
+ */
+#ifndef NVTX_EXT_PAYLOAD_COMPATID
+#define NVTX_EXT_PAYLOAD_COMPATID 0x0104
+#endif
+
+/**
+ * \brief The module ID identifies the payload extension. It has to be unique
+ * among the extension modules.
+ */
+#ifndef NVTX_EXT_PAYLOAD_MODULEID
+#define NVTX_EXT_PAYLOAD_MODULEID 2
+#endif
+
+/**
+ * \brief Additional value for the enum @ref nvtxPayloadType_t
+ */
+#ifndef NVTX_PAYLOAD_TYPE_EXT
+#define NVTX_PAYLOAD_TYPE_EXT ((int32_t)0xDFBD0009)
+#endif
+
+/** ---------------------------------------------------------------------------
+ * Payload schema entry flags. Used for @ref nvtxPayloadSchemaEntry_t::flags.
+ * ------------------------------------------------------------------------- */
+#ifndef NVTX_PAYLOAD_ENTRY_FLAGS_V1
+#define NVTX_PAYLOAD_ENTRY_FLAGS_V1
+
+#define NVTX_PAYLOAD_ENTRY_FLAG_UNUSED 0
+
+/**
+ * Absolute pointer into a payload (entry) of the same event.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_POINTER          (1 << 1)
+
+/**
+ * Offset from base address of the payload.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_BASE (1 << 2)
+
+/**
+ * Offset from the end of this payload entry.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_HERE (1 << 3)
+
+/**
+ * The value is an array with fixed length, set with the field `arrayLength`.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE           (1 << 4)
+
+/**
+ * The value is a zero-/null-terminated array.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED      (2 << 4)
+
+/**
+ * \brief A single or multi-dimensional array of variable length.
+ *
+ * The field `arrayOrUnionDetail` contains the index of the schema entry that
+ * holds the length(s). If the length entry is a scalar, then this entry is a 1D
+ * array. If the length entry is a fixed-size array, then the number of
+ * dimensions is defined with the registration of the schema. If the length
+ * entry is a zero-terminated array, then the array of the dimensions can be
+ * determined at runtime.
+ * For multidimensional arrays, values are stored in row-major order, with rows
+ * being stored consecutively in contiguous memory. The size of the entry (in
+ * bytes) is the product of the dimensions multiplied with size of the array
+ * element.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX         (3 << 4)
+
+/**
+ * \brief A single or multi-dimensional array of variable length, where the
+ * dimensions are stored in a different payload (index) of the same event.
+ *
+ * This enables an existing address to an array to be directly passed, while the
+ * dimensions are defined in a separate payload (with only one payload entry).
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_PAYLOAD_INDEX (4 << 4)
+
+/**
+ * \brief The value or data that is pointed to by this payload entry value shall
+ * be copied by the NVTX handler.
+ *
+ * A tool may not support deep copy and just ignore this flag.
+ * See @ref NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY for more details.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY          (1 << 8)
+
+/**
+ * Notifies the NVTX handler to hide this entry in case of visualization.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_HIDE               (1 << 9)
+
+/**
+ * The entry specifies the event message. Any string type can be used.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE      (1 << 10)
+
+/**
+ * \brief The entry contains an event timestamp.
+ *
+ * The time source might be provided via the entry semantics field. In most
+ * cases, the timestamp (entry) type is @ref NVTX_PAYLOAD_ENTRY_TYPE_UINT64.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP    (2 << 10)
+
+/**
+ * These flags specify the NVTX event type to which an entry refers.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN        (1 << 12)
+#define NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END          (2 << 12)
+#define NVTX_PAYLOAD_ENTRY_FLAG_MARK               (3 << 12)
+#define NVTX_PAYLOAD_ENTRY_FLAG_COUNTER            (4 << 12)
+
+#endif /* NVTX_PAYLOAD_ENTRY_FLAGS_V1 */
+/** ---------------------------------------------------------------------------
+ * END: Payload schema entry flags.
+ * ------------------------------------------------------------------------- */
+
+/** \todo: Keep this in the header? */
+/**
+ * @note The ‘array’ flags assume that the array is embedded. Otherwise,
+ * @ref NVTX_PAYLOAD_ENTRY_FLAG_POINTER has to be additionally specified. Some
+ * combinations may be invalid based on the `NVTX_PAYLOAD_SCHEMA_TYPE_*` this
+ * entry is enclosed. For instance, variable length embedded arrays are valid
+ * within @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC but invalid with
+ * @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC. See `NVTX_PAYLOAD_SCHEMA_TYPE_*` for
+ * additional details.
+ */
+
+/* Helper mask for array entries. NOTE(review): omits ARRAY_LENGTH_PAYLOAD_INDEX (4 << 4) - confirm. */
+#define NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY (\
+    NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE | \
+    NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED | \
+    NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX)
+/* Extracts the array-kind bits of flags F; F is parenthesized so compound expressions mask correctly. */
+#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_TYPE(F) \
+    ((F) & NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY)
+/** \todo end */
+
+
+/** ---------------------------------------------------------------------------
+ * Types of entries in a payload schema.
+ *
+ * @note Several of the predefined types contain the size (in bits) in their
+ * names. For some data types the size (in bytes) is not fixed and may differ
+ * for different platforms/operating systems/compilers. To provide portability,
+ * an array of sizes (in bytes) for type 1 to 28 ( @ref
+ * NVTX_PAYLOAD_ENTRY_TYPE_CHAR to @ref NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE)
+ * is passed to the NVTX extension initialization function
+ * @ref InitializeInjectionNvtxExtension via the `extInfo` field of
+ * @ref nvtxExtModuleInfo_t.
+ * ------------------------------------------------------------------------- */
+#ifndef NVTX_PAYLOAD_ENTRY_TYPES_V1
+#define NVTX_PAYLOAD_ENTRY_TYPES_V1
+
+#define NVTX_PAYLOAD_ENTRY_TYPE_INVALID     0
+
+/**
+ * Basic integer types.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR        1
+#define NVTX_PAYLOAD_ENTRY_TYPE_UCHAR       2
+#define NVTX_PAYLOAD_ENTRY_TYPE_SHORT       3
+#define NVTX_PAYLOAD_ENTRY_TYPE_USHORT      4
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT         5
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT        6
+#define NVTX_PAYLOAD_ENTRY_TYPE_LONG        7
+#define NVTX_PAYLOAD_ENTRY_TYPE_ULONG       8
+#define NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG    9
+#define NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG  10
+
+/**
+ * Integer types with explicit size.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT8       11
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT8      12
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT16      13
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT16     14
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT32      15
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT32     16
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT64      17
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT64     18
+
+/**
+ * Floating point types
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT      19
+#define NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE     20
+#define NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE 21
+
+/**
+ * Size type (`size_t` in C).
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_SIZE       22
+
+/**
+ * Any address, e.g. `void*`. If the pointer type matters, use the flag @ref
+ * NVTX_PAYLOAD_ENTRY_FLAG_POINTER and the respective type instead.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS    23
+
+/**
+ * Special character types.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_WCHAR      24 /* wide character (since C90) */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR8      25 /* since C2x and C++20 */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR16     26
+#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR32     27
+
+/**
+ * There is type size and alignment information for all previous types.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE (NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 + 1)
+
+/**
+ * Store raw 8-bit binary data. As with `char`, 1-byte alignment is assumed.
+ * Typically, a tool will display this as hex or binary.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_BYTE       32
+
+/**
+ * These types do not have standardized equivalents. It is assumed that the
+ * number at the end corresponds to the bits used to store the value and that
+ * the alignment corresponds to standardized types of the same size.
+ * A tool may not support these types.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT128     33
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT128    34
+
+#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT16    42
+#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT32    43
+#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT64    44
+#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT128   45
+
+#define NVTX_PAYLOAD_ENTRY_TYPE_BF16       50
+#define NVTX_PAYLOAD_ENTRY_TYPE_TF32       52
+
+/**
+ * Data types are as defined by NVTXv3 core.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CATEGORY   68 /* uint32_t */
+#define NVTX_PAYLOAD_ENTRY_TYPE_COLOR_ARGB 69 /* uint32_t */
+
+/**
+ * The scope of events or counters (see `nvtxScopeRegister`).
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_SCOPE_ID   70
+
+/**
+ * Process ID as scope 
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32 71
+#define NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT64 72
+
+/**
+ * Thread ID as scope (see `nvtxGetActiveThreadId` for valid values).
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32 73
+#define NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT64 74
+
+/**
+ * \brief String types.
+ *
+ * If no flags are set for the entry and `arrayOrUnionDetail > 0`, the entry is
+ * assumed to be a fixed-size string with the given length, embedded in the payload.
+ * `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is redundant for fixed-size strings.
+ *
+ * \todo(Revise the following paragraph.)
+ * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` specifies a
+ * zero-terminated string. If `arrayOrUnionDetail > 0`, the entry is handled as
+ * a zero-terminated array of fixed-size strings.
+ *
+ * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies a
+ * variable-length string with the length given in the entry specified by the
+ * field `arrayOrUnionDetail`.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING       75 /* `char*`, system LOCALE */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF8  76
+#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF16 77
+#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF32 78
+
+/**
+ * The entry value is of type @ref nvtxStringHandle_t returned by
+ * @ref nvtxDomainRegisterString.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE 80
+
+/**
+ * This type marks the union selector member (entry index) in schemas used by
+ * a union with internal selector.
+ * See @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_UNION_SELECTOR 100
+
+/**
+ * \brief Predefined schema ID for payload data that is referenced in another payload.
+ *
+ * This schema ID can be used in @ref nvtxPayloadData_t::schema_id to indicate that the
+ * payload is a blob of memory which other payload entries may point into.
+ * A tool will not expose this payload directly.
+ *
+ * This schema ID cannot be used as schema entry type!
+ */
+#define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022
+
+/**
+ * \brief Predefined schema ID for raw payload data.
+ *
+ * This schema ID can be used in @ref nvtxPayloadData_t::schema_id to indicate
+ * that the payload is a blob, which can be shown with an arbitrary data viewer.
+ * This schema ID cannot be used as schema entry type!
+ */
+#define NVTX_TYPE_PAYLOAD_SCHEMA_RAW        1023
+
+/**
+ * \deprecated: Remove for official release!
+ * In the initial version of this header custom schema IDs started
+ * here. Unless predefined types require more than 16 bits we can keep this
+ * value to preserve backwards compatibility. The value is not used as first
+ * ID for custom schemas any more, but in the analysis every entry type >= this
+ * value is assumed to be a custom schema.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CUSTOM_BASE 65536
+
+/* Custom (static) schema IDs. */
+#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START  (1 << 24)
+
+/* Dynamic schema IDs (generated by the tool) start here. */
+#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START ((uint64_t)1 << 32)
+
+#endif /* NVTX_PAYLOAD_ENTRY_TYPES_V1 */
+/** ---------------------------------------------------------------------------
+ * END: Payload schema entry types.
+ * ------------------------------------------------------------------------- */
+
+
+#ifndef NVTX_PAYLOAD_SCHEMA_TYPES_V1
+#define NVTX_PAYLOAD_SCHEMA_TYPES_V1
+
+/**
+ * \brief The payload schema type.
+ *
+ * A schema can be either of the following types. It is set with
+ * @ref nvtxPayloadSchemaAttr_t::type.
+ */
+#define NVTX_PAYLOAD_SCHEMA_TYPE_INVALID                      0
+#define NVTX_PAYLOAD_SCHEMA_TYPE_STATIC                       1
+#define NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC                      2
+#define NVTX_PAYLOAD_SCHEMA_TYPE_UNION                        3
+#define NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR 4
+
+#endif /* NVTX_PAYLOAD_SCHEMA_TYPES_V1 */
+
+
+#ifndef NVTX_PAYLOAD_SCHEMA_FLAGS_V1
+#define NVTX_PAYLOAD_SCHEMA_FLAGS_V1
+
+/**
+ * \brief Flags for static and dynamic schemas.
+ *
+ * The schema flags are used with @ref nvtxPayloadSchemaAttr_t::flags.
+ */
+#define NVTX_PAYLOAD_SCHEMA_FLAG_NONE           0
+
+/**
+ * This flag indicates that a schema and the corresponding payloads can
+ * contain fields which require a deep copy.
+ */
+#define NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY      (1 << 1)
+
+/**
+ * This flag indicates that a schema and the corresponding payload can be
+ * referenced by another payload of the same event. If the schema is not
+ * intended to be visualized directly, it is possible to use
+ * @ref NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED instead.
+ */
+#define NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED     (1 << 2)
+
+/**
+ * The schema defines a counter group. An NVTX handler can expect that the schema
+ * contains entries with counter semantics.
+ */
+#define NVTX_PAYLOAD_SCHEMA_FLAG_COUNTER_GROUP  (1 << 3)
+
+
+#endif /* NVTX_PAYLOAD_SCHEMA_FLAGS_V1 */
+
+
+#ifndef NVTX_PAYLOAD_SCHEMA_ATTRS_V1
+#define NVTX_PAYLOAD_SCHEMA_ATTRS_V1
+
+/**
+ * The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be
+ * specified via setting the field `fieldMask`.
+ */
+#define NVTX_PAYLOAD_SCHEMA_ATTR_NAME        (1 << 1)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE        (1 << 2)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS       (1 << 3)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES     (1 << 4)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT   (1 << 7)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID   (1 << 8)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_EXTENSION   (1 << 9)
+
+#endif /* NVTX_PAYLOAD_SCHEMA_ATTRS_V1 */
+
+
+#ifndef NVTX_PAYLOAD_ENUM_ATTRS_V1
+#define NVTX_PAYLOAD_ENUM_ATTRS_V1
+
+/**
+ * The values are used to set the field `fieldMask` and specify which fields in
+ * @ref nvtxPayloadEnumAttr_t are set.
+ */
+#define NVTX_PAYLOAD_ENUM_ATTR_NAME        (1 << 1)
+#define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES     (1 << 2)
+#define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3)
+#define NVTX_PAYLOAD_ENUM_ATTR_SIZE        (1 << 4)
+#define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID   (1 << 5)
+#define NVTX_PAYLOAD_ENUM_ATTR_EXTENSION   (1 << 6)
+
+#endif /* NVTX_PAYLOAD_ENUM_ATTRS_V1 */
+
+/** Deprecated NVTX scope defines. */
+#ifndef NVTX_SCOPES_V0
+#define NVTX_SCOPES_V0
+
+#define NVTX_EVENT_SCOPE_INVALID       	0
+#define NVTX_EVENT_SCOPE_NONE           1 /* Global/base/root or no scope */
+
+/* Hardware events */
+#define NVTX_EVENT_SCOPE_HW_MACHINE     2 /* Node/machine name, Device? */
+#define NVTX_EVENT_SCOPE_HW_SOCKET      3
+#define NVTX_EVENT_SCOPE_HW_CPU         4
+#define NVTX_EVENT_SCOPE_HW_CPU_LOGICAL 5
+/* Innermost HW execution context at registration time */
+#define NVTX_EVENT_SCOPE_HW_INNERMOST   6
+
+/* Virtualized hardware, virtual machines */
+#define NVTX_EVENT_SCOPE_VM             7
+
+/* Software scopes */
+#define NVTX_EVENT_SCOPE_SW_PROCESS     8 /* Process scope */
+#define NVTX_EVENT_SCOPE_SW_THREAD      9 /* Thread scope */
+/* Innermost SW execution context at registration time */
+#define NVTX_EVENT_SCOPE_SW_INNERMOST   10
+
+#endif /* NVTX_SCOPES_V0 */
+
+/**
+ * An NVTX scope specifies the execution scope or source of events or counters.
+ * A tool determines the value for a predefined scope when the sample is taken.
+ */
+#ifndef NVTX_SCOPES_V1
+#define NVTX_SCOPES_V1
+
+#define NVTX_SCOPE_NONE                    0 /* No scope specified. */
+#define NVTX_SCOPE_ROOT                    1 /* The root in a hierarchy. */
+
+/* Hardware events */
+#define NVTX_SCOPE_CURRENT_HW_MACHINE      2 /* Node/machine name */
+#define NVTX_SCOPE_CURRENT_HW_SOCKET       3
+#define NVTX_SCOPE_CURRENT_HW_CPU_PHYSICAL 4 /* Physical CPU core */
+#define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL  5 /* Logical CPU core */
+/* Innermost HW execution context */
+#define NVTX_SCOPE_CURRENT_HW_INNERMOST   15
+
+/* Virtualized hardware, virtual machines */
+#define NVTX_SCOPE_CURRENT_HYPERVISOR     16
+#define NVTX_SCOPE_CURRENT_VM             17
+#define NVTX_SCOPE_CURRENT_KERNEL         18
+#define NVTX_SCOPE_CURRENT_CONTAINER      19
+#define NVTX_SCOPE_CURRENT_OS             20
+
+/* Software scopes */
+#define NVTX_SCOPE_CURRENT_SW_PROCESS     21 /* Process scope */
+#define NVTX_SCOPE_CURRENT_SW_THREAD      22 /* Thread scope */
+/* Innermost SW execution context */
+#define NVTX_SCOPE_CURRENT_SW_INNERMOST   31
+
+/** Static (user-provided) scope IDs (feed forward) */
+#define NVTX_SCOPE_ID_STATIC_START  (1 << 24)
+
+/* Dynamically (tool) generated scope IDs */
+#define NVTX_SCOPE_ID_DYNAMIC_START 4294967296  /* 1 << 32 */
+
+#endif /* NVTX_SCOPES_V1 */
+
+
+#ifndef NVTX_DEFERRED_EVENTS_SORTING_V1
+#define NVTX_DEFERRED_EVENTS_SORTING_V1
+/**
+ * Deferred events are assumed to be in chronological order by default.
+ */
+#define NVTX_DEFERRED_EVENTS_SORTED                  0
+#define NVTX_DEFERRED_EVENTS_SORTED_PER_EVENT_SOURCE 1
+#define NVTX_DEFERRED_EVENTS_UNSORTED                2
+
+#endif /* NVTX_DEFERRED_EVENTS_SORTING_V1 */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifndef NVTX_PAYLOAD_TYPEDEFS_V1
+#define NVTX_PAYLOAD_TYPEDEFS_V1
+
+/**
+ * \brief Size and alignment information for predefined payload entry types.
+ *
+ * The struct contains the size and the alignment size in bytes. A respective
+ * array for the predefined types is passed via nvtxExtModuleInfo_t to the NVTX
+ * client/handler. The type (ID) is used as index into this array.
+ */
+typedef struct nvtxPayloadEntryTypeInfo_v1
+{
+    uint16_t size;  /* Size of the type in bytes. */
+    uint16_t align; /* Alignment of the type in bytes. */
+} nvtxPayloadEntryTypeInfo_t;
+
+/**
+ * \brief Binary payload data, size and decoding information.
+ *
+ * An array of type `nvtxPayloadData_t` is attached to an NVTX event via the
+ * `payload.ullvalue` field of the NVTX event attributes.
+ *
+ * The `schemaId` can be a predefined schema entry type (`NVTX_PAYLOAD_ENTRY_TYPE*`),
+ * a schema ID (statically specified or dynamically created) or one of
+ * `NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED` or `NVTX_TYPE_PAYLOAD_SCHEMA_RAW`.
+ *
+ * Setting the size of a payload to `SIZE_MAX` can be useful to reduce the
+ * overhead of NVTX instrumentation, when no NVTX handler is attached. However,
+ * a tool might not be able to detect the size of a payload and thus skip it.
+ * A reasonable use case is a payload that represents a null-terminated
+ * C string, where the NVTX handler can call `strlen()`.
+ */
+typedef struct nvtxPayloadData_v1
+{
+    /**
+     * The schema ID, which defines the layout of the binary data.
+     */
+    uint64_t    schemaId;
+
+    /**
+     * Size of the payload (blob) in bytes. `SIZE_MAX` (`-1`) indicates to the
+     * tool that it should figure out the size, which might not be possible.
+     */
+    size_t      size;
+
+    /**
+     * Pointer to the binary payload data.
+     */
+    const void* payload;
+} nvtxPayloadData_t;
+
+
+/**
+ * \brief Header of the payload entry's semantic field.
+ *
+ * If the semantic field of the payload schema entry is set, the first four
+ * fields (header) are defined with this type. A tool can iterate through the
+ * extensions and check whether it supports (can handle) each of them.
+ */
+typedef struct nvtxSemanticsHeader_v1
+{
+    uint32_t structSize; /** Size of semantic extension struct. */
+    uint16_t semanticId;
+    uint16_t version;
+    const struct nvtxSemanticsHeader_v1* next; /** linked list */
+    /* Additional fields are defined by the specific semantic extension. */
+} nvtxSemanticsHeader_t;
+
+/**
+ * \brief Entry in a schema.
+ *
+ * A payload schema consists of an array of payload schema entries. It is
+ * registered with @ref nvtxPayloadSchemaRegister. `flags` can be set to `0` for
+ * simple values; `type` is the only required field. If not set explicitly,
+ * all other fields are zero-initialized, which means that the entry has no name
+ * and the offset is determined based on self-alignment rules.
+ *
+ * Example schema:
+ *  nvtxPayloadSchemaEntry_t schema[] = {
+ *      {0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "one byte"},
+ *      {0, NVTX_PAYLOAD_ENTRY_TYPE_INT32, "four bytes"}
+ *  };
+ */
+typedef struct nvtxPayloadSchemaEntry_v1
+{
+    /**
+     * \brief Flags to augment the basic type.
+     *
+     * This field allows additional properties of the payload entry to be
+     * specified. Valid values are `NVTX_PAYLOAD_ENTRY_FLAG_*`.
+     */
+    uint64_t       flags;
+
+    /**
+     * \brief Predefined payload schema entry type or custom schema ID.
+     *
+     * Predefined types are `NVTX_PAYLOAD_ENTRY_TYPE_*`. Passing a schema ID
+     * enables nesting of schemas.
+     */
+    uint64_t       type;
+
+    /**
+     * \brief Name or label of the payload entry. (Optional)
+     *
+     * A meaningful name or label can help organizing and interpreting the data.
+     */
+    const char*    name;
+
+    /**
+     * \brief Description of the payload entry. (Optional)
+     *
+     * A more detailed description of the data that is stored with this entry.
+     */
+    const char*    description;
+
+    /**
+     * \brief String length, array length or member selector for union types.
+     *
+     * If @ref type is a C string type, this field specifies the string length.
+     *
+     * If @ref flags specify that the entry is an array, this field specifies
+     * the array length. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more details.
+     *
+     * If @ref type is a union with schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION
+     * (external selection of the union member), this field contains the index
+     * (starting with 0) to an entry of integral type in the same schema. The
+     * associated field value specifies the selected union member.
+     *
+     * @note An array of schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION is not
+     * supported. @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR can
+     * be used instead.
+     */
+    uint64_t       arrayOrUnionDetail;
+
+    /**
+     * \brief Offset in the binary payload data (in bytes).
+     *
+     * This field specifies the byte offset from the base address of the actual
+     * binary data (blob) to the start address of the data of this entry.
+     *
+     * It is recommended (but not required) to provide the offset. Otherwise,
+     * the NVTX handler will determine the offset from natural alignment rules.
+     * In some cases, e.g. dynamic schema layouts, the offset cannot be set and
+     * has to be determined based on the data of prior entries.
+     *
+     * Setting the offset can also be used to skip entries during payload parsing.
+     */
+    uint64_t       offset;
+
+    /**
+     * \brief Additional semantics of the payload entry.
+     *
+     * The field points to the first element in a linked list, which enables
+     * multiple semantic extensions.
+     */
+    const nvtxSemanticsHeader_t* semantics;
+
+    /**
+     * \brief Reserved for future use. Do not use it!
+     */
+    const void*    reserved;
+} nvtxPayloadSchemaEntry_t;
+
+/**
+ * \brief NVTX payload schema attributes.
+ */
+typedef struct nvtxPayloadSchemaAttr_v1
+{
+    /**
+     * \brief Mask of valid fields in this struct.
+     *
+     * Use the `NVTX_PAYLOAD_SCHEMA_ATTR_*` defines.
+     */
+    uint64_t                        fieldMask;
+
+    /**
+     * \brief Name of the payload schema. (Optional)
+     */
+    const char*                     name;
+
+    /**
+     * \brief Payload schema type. (Mandatory) \anchor PAYLOAD_TYPE_FIELD
+     *
+     * Use the `NVTX_PAYLOAD_SCHEMA_TYPE_*` defines.
+     */
+    uint64_t                        type;
+
+    /**
+     * \brief Payload schema flags. (Optional)
+     *
+     * Flags defined by `NVTX_PAYLOAD_SCHEMA_FLAG_*` can be used to set
+     * additional properties of the schema.
+     */
+    uint64_t                        flags;
+
+    /**
+     * \brief Entries of a payload schema. (Mandatory) \anchor ENTRIES_FIELD
+     *
+     * This field is a pointer to an array of schema entries, each describing a
+     * field in a data structure, e.g. in a C struct or union.
+     */
+    const nvtxPayloadSchemaEntry_t* entries;
+
+    /**
+     * \brief Number of entries in the payload schema. (Mandatory)
+     *
+     * Number of entries in the array of payload entries \ref ENTRIES_FIELD.
+     */
+    size_t                          numEntries;
+
+    /**
+     * \brief The binary payload size in bytes for static payload schemas.
+     *
+     * If \ref PAYLOAD_TYPE_FIELD is @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC this
+     * value is ignored. If this field is not specified for a schema of type
+     * @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, the size can be automatically
+     * determined by a tool.
+     */
+    size_t                          payloadStaticSize;
+
+    /**
+     * \brief The byte alignment for packed structures.
+     *
+     * If not specified, this field defaults to `0`, which means that the fields
+     * in the data structure are not packed and natural alignment rules can be
+     * applied.
+     */
+    size_t                          packAlign;
+
+    /**
+     * Static/custom schema ID must be
+     * >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and
+     * < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START
+     */
+    uint64_t                        schemaId;
+
+    /**
+     * Flexible extension for schema attributes.
+     * (Do not use. Reserved for future use.)
+     */
+    void*                           extension;
+} nvtxPayloadSchemaAttr_t;
+
+/**
+ * \brief This type is used to describe an enumeration.
+ *
+ * Since the value of an enum entry might not be meaningful for the analysis
+ * and/or visualization, a tool can show the name of the enum entry instead.
+ *
+ * An array of this struct is passed to @ref nvtxPayloadEnumAttr_t::entries to be
+ * finally registered via @ref nvtxPayloadEnumRegister with the NVTX handler.
+ *
+ * @note EXPERIMENTAL
+ */
+typedef struct nvtxPayloadEnum_v1
+{
+    /**
+     * Name of the enum value.
+     */
+    const char* name;
+
+    /**
+     * Value of the enum entry.
+     */
+    uint64_t    value;
+
+    /**
+     * Indicates that this entry sets a specific set of bits, which can be used
+     * to define bitsets.
+     */
+    int8_t      isFlag;
+} nvtxPayloadEnum_t;
+
+/**
+ * \brief NVTX payload enumeration type attributes.
+ *
+ * A pointer to this struct is passed to @ref nvtxPayloadEnumRegister.
+ */
+typedef struct nvtxPayloadEnumAttr_v1
+{
+    /**
+     * Mask of valid fields in this struct. See `NVTX_PAYLOAD_ENUM_ATTR_*`.
+     */
+    uint64_t                 fieldMask;
+
+    /**
+     * Name of the enum. (Optional)
+     */
+    const char*              name;
+
+    /**
+     * Entries of the enum. (Mandatory)
+     */
+    const nvtxPayloadEnum_t* entries;
+
+    /**
+     * Number of entries in the enum. (Mandatory)
+     */
+    size_t                   numEntries;
+
+    /**
+     * Size of enumeration type in bytes
+     */
+    size_t                   sizeOfEnum;
+
+    /**
+     * Static/custom schema ID must be
+     * >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and
+     * < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START
+     */
+    uint64_t                 schemaId;
+
+    /**
+     * Flexible extension for enumeration attributes.
+     * (Do not use. Reserved for future use.)
+     */
+    void*                    extension;
+} nvtxPayloadEnumAttr_t;
+
+typedef struct nvtxScopeAttr_v1
+{
+    size_t structSize;
+
+    /**
+     * Path delimited by '/' characters, relative to parentScope. Leading
+     * slashes are ignored. Nodes in the path may use name[key] syntax to
+     * indicate an array of sibling nodes, which may be combined with other
+     * non-array nodes or different arrays at the same scope. Node names should
+     * be UTF8 printable characters. '\' has to be used to escape '/', '[', and
+     * ']' characters in node names. An empty C string "" and `NULL` are valid
+     * inputs and treated equivalently.
+     */
+    const char* path;
+
+    uint64_t parentScope;
+
+    /**
+     * The static scope ID must be unique within the domain,
+     * >= NVTX_SCOPE_ID_STATIC_START, and
+     * < NVTX_SCOPE_ID_DYNAMIC_START.
+     */
+    uint64_t scopeId;
+} nvtxScopeAttr_t;
+
+#endif /* NVTX_PAYLOAD_TYPEDEFS_V1 */
+
+#ifndef NVTX_PAYLOAD_API_FUNCTIONS_V1
+#define NVTX_PAYLOAD_API_FUNCTIONS_V1
+
+/**
+ * \brief Register a payload schema.
+ *
+ * @param domain NVTX domain handle.
+ * @param attr NVTX payload schema attributes.
+ */
+NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister(
+    nvtxDomainHandle_t domain,
+    const nvtxPayloadSchemaAttr_t* attr);
+
+/**
+ * \brief Register an enumeration type with the payload extension.
+ *
+ * @param domain NVTX domain handle
+ * @param attr NVTX payload enumeration type attributes.
+ */
+NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadEnumRegister(
+    nvtxDomainHandle_t domain,
+    const nvtxPayloadEnumAttr_t* attr);
+
+/**
+ * \brief Register a scope.
+ *
+ * @param domain NVTX domain handle (0 for default domain)
+ * @param attr Scope attributes.
+ *
+ * @return an identifier for the scope. If the operation was not successful,
+ * `NVTX_SCOPE_NONE` is returned.
+ */
+NVTX_DECLSPEC uint64_t NVTX_API nvtxScopeRegister(
+    nvtxDomainHandle_t domain,
+    const nvtxScopeAttr_t* attr);
+
+/**
+ * \brief Marks an instantaneous event in the application with the attributes
+ * being passed via the extended payload.
+ *
+ * An NVTX handler can assume that the payload contains the event message.
+ * Otherwise, it might ignore the event.
+ *
+ * @param domain NVTX domain handle
+ * @param payloadData pointer to an array of structured payloads.
+ * @param count number of payload BLOBs.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxMarkPayload(
+    nvtxDomainHandle_t domain,
+    const nvtxPayloadData_t* payloadData,
+    size_t count);
+
+/**
+ * \brief Begin a nested thread range with the attributes being passed via the
+ * payload.
+ *
+ * @param domain NVTX domain handle
+ * @param payloadData pointer to an array of structured payloads.
+ * @param count number of payload BLOBs.
+ *
+ * @return The level of the range being started. If an error occurs a negative
+ * value is returned on the current thread.
+ */
+NVTX_DECLSPEC int NVTX_API nvtxRangePushPayload(
+    nvtxDomainHandle_t domain,
+    const nvtxPayloadData_t* payloadData,
+    size_t count);
+
+/**
+ * \brief End a nested thread range with an additional custom payload.
+ *
+ * NVTX event attributes passed to this function (via the payloads) overwrite
+ * event attributes (message and color) that have been set in the push event.
+ * Other payload entries extend the data of the range.
+ *
+ * @param domain NVTX domain handle
+ * @param payloadData pointer to an array of structured payloads.
+ * @param count number of payload BLOBs.
+ *
+ * @return The level of the range being ended. If an error occurs a negative
+ * value is returned on the current thread.
+ */
+NVTX_DECLSPEC int NVTX_API nvtxRangePopPayload(
+    nvtxDomainHandle_t domain,
+    const nvtxPayloadData_t* payloadData,
+    size_t count);
+
+/**
+ * \brief Start a thread range with attributes passed via the extended payload.
+ *
+ * @param domain NVTX domain handle
+ * @param payloadData pointer to an array of structured payloads.
+ * @param count number of payload BLOBs.
+ *
+ * @return The correlation ID of the started range. Pass it to
+ * `nvtxRangeEndPayload()` to end the range.
+ */
+NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartPayload(
+    nvtxDomainHandle_t domain,
+    const nvtxPayloadData_t* payloadData,
+    size_t count);
+
+/**
+ * \brief End a thread range and pass a custom payload.
+ *
+ * NVTX event attributes passed to this function (via the payloads) overwrite
+ * event attributes (message and color) that have been set in the start event.
+ * Other payload entries extend the data of the range.
+ *
+ * @param domain NVTX domain handle
+ * @param id The correlation ID returned from a NVTX range start call.
+ * @param payloadData pointer to an array of structured payloads.
+ * @param count number of payload BLOBs.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxRangeEndPayload(
+    nvtxDomainHandle_t domain,
+    nvtxRangeId_t id,
+    const nvtxPayloadData_t* payloadData,
+    size_t count);
+
+/**
+ * @brief Checks if an NVTX domain is enabled.
+ *
+ * @param domain NVTX domain handle
+ * @return 0 if the domain is not enabled.
+ */
+NVTX_DECLSPEC uint8_t NVTX_API nvtxDomainIsEnabled(
+    nvtxDomainHandle_t domain);
+
+/**
+ * \brief Report a push-pop range in a single call.
+ * \category NsysInternal
+ *
+ * This function is called at range pop. Thus, the NVTX handler will immediately
+ * take a timestamp (if timing is desired). The timestamp of the push operation
+ * is passed as argument and can be retrieved via `nvtxTimestampGet()`.
+ *
+ * The NVTX handler can assume that no other push operation happened in the same
+ * domain in between the push and the pop time of the reported range.
+ *
+ * @param domain The domain of scoping.
+ * @param eventAttrib The event attribute structure defining the range's
+ * attribute types and attribute values.
+ * @param pushTime The timestamp of the push operation (use `nvtxTimestampGet()`).
+ */
+NVTX_DECLSPEC void NVTX_API nvtxRangePushPop(nvtxDomainHandle_t domain,
+    const nvtxEventAttributes_t* eventAttrib, uint64_t pushTime);
+
+/**
+ * \brief Get a timestamp from the attached NVTX handler/tool.
+ *
+ * The timestamp is intended to be passed as `pushTime` to `nvtxRangePushPop()`.
+ * The time source is assumed to be TSC.
+ */
+NVTX_DECLSPEC int64_t NVTX_API nvtxTimestampGet(void);
+
+#endif /* NVTX_PAYLOAD_API_FUNCTIONS_V1 */
+
+#ifndef NVTX_PAYLOAD_CALLBACK_ID_V1
+#define NVTX_PAYLOAD_CALLBACK_ID_V1
+/**
+ * \brief Callback Ids of API functions in the payload extension.
+ *
+ * The NVTX handler can use these values to register a handler function. When
+ * InitializeInjectionNvtxExtension(nvtxExtModuleInfo_t* moduleInfo) is
+ * executed, a handler routine 'YourPayloadRegisterSchemaHandlerFn' can be
+ * registered as follows:
+ * \code{.c}
+ *      moduleInfo->segments->functionSlots[NVTX3EXT_CBID_nvtxPayloadSchemaRegister] =
+ *          (intptr_t)YourPayloadRegisterSchemaHandlerFn;
+ * \endcode
+ */
+#define NVTX3EXT_CBID_nvtxPayloadSchemaRegister      0
+#define NVTX3EXT_CBID_nvtxPayloadEnumRegister        1
+#define NVTX3EXT_CBID_nvtxMarkPayload                2
+#define NVTX3EXT_CBID_nvtxRangePushPayload           3
+#define NVTX3EXT_CBID_nvtxRangePopPayload            4
+#define NVTX3EXT_CBID_nvtxRangeStartPayload          5
+#define NVTX3EXT_CBID_nvtxRangeEndPayload            6
+#define NVTX3EXT_CBID_nvtxDomainIsEnabled            7
+#define NVTX3EXT_CBID_nvtxTimestampGet               8
+#define NVTX3EXT_CBID_nvtxScopeRegister             12
+
+/* NSys internal use only. */
+#define NVTX3EXT_CBID_nvtxRangePushPop              62
+#endif /* NVTX_PAYLOAD_CALLBACK_ID_V1 */
+
+/*** Helper utilities ***/
+
+/** \brief  Helper macro: converts a pointer to a uint64_t value by casting through uintptr_t. */
+#ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE
+# ifdef __cplusplus
+# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \
+    static_cast<uint64_t>(reinterpret_cast<uintptr_t>(p))
+# else /* C: the argument is parenthesized so that an expression such as `a + 1` is cast as a whole. */
+# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)(p))
+# endif
+#endif
+
+#ifndef NVTX_PAYLOAD_EVTATTR_SET_DATA
+/**
+ * \brief Helper macro to attach a single payload to an NVTX event attribute.
+ * It expands to several statements that are not wrapped in a block, so put it in braces when it is the body of an if/else or a loop.
+ * @param evtAttr NVTX event attribute (variable name)
+ * @param pldata_addr Address of `nvtxPayloadData_t` variable.
+ * @param schema_id NVTX binary payload schema ID.
+ * @param pl_addr Address of the (actual) payload.
+ * @param sz size of the (actual) payload.
+ */
+#define NVTX_PAYLOAD_EVTATTR_SET_DATA(evtAttr, pldata_addr, schema_id, pl_addr, sz) \
+    (pldata_addr)->schemaId = schema_id; \
+    (pldata_addr)->size = sz; \
+    (pldata_addr)->payload = pl_addr; \
+    (evtAttr).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(pldata_addr); \
+    (evtAttr).payloadType = NVTX_PAYLOAD_TYPE_EXT; \
+    (evtAttr).reserved0 = 1;
+#endif /* NVTX_PAYLOAD_EVTATTR_SET_DATA */
+
+#ifndef NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE
+/**
+ * \brief Helper macro to attach multiple payloads to an NVTX event attribute.
+ * The number of entries is computed with `sizeof`, so `pldata` must be an array, not a pointer.
+ * @param evtAttr NVTX event attribute (variable name)
+ * @param pldata Payload data array (of type `nvtxPayloadData_t`)
+ */
+#define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(evtAttr, pldata) \
+    (evtAttr).payloadType = NVTX_PAYLOAD_TYPE_EXT; \
+    (evtAttr).reserved0 = sizeof(pldata)/sizeof(nvtxPayloadData_t); \
+    (evtAttr).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(pldata);
+#endif /* NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE */
+
+#ifndef NVTX_PAYLOAD_EVTATTR_SET
+/*
+ * Do not use this macro directly! It is a helper to attach a single payload to
+ * an NVTX event attribute (given as a pointer): it declares a local array and stores its address.
+ * @warning The NVTX push, start or mark operation must not be in an outer scope.
+ */
+#define NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schema_id, pl_addr, sz) \
+    nvtxPayloadData_t _NVTX_PAYLOAD_DATA_VAR[] = \
+        {{schema_id, sz, pl_addr}}; \
+    (evtAttr)->payload.ullValue = \
+        NVTX_POINTER_AS_PAYLOAD_ULLVALUE(_NVTX_PAYLOAD_DATA_VAR); \
+    (evtAttr)->payloadType = NVTX_PAYLOAD_TYPE_EXT; \
+    (evtAttr)->reserved0 = 1;
+#endif /* NVTX_PAYLOAD_EVTATTR_SET */
+
+#ifndef nvtxPayloadRangePush
+/**
+ * \brief Helper macro to push a range with extended payload.
+ * The payload is attached to `evtAttr` and the range is pushed inside one do/while(0) block.
+ * @param domain NVTX domain handle (0 for default domain)
+ * @param evtAttr pointer to NVTX event attribute.
+ * @param schemaId NVTX payload schema ID
+ * @param plAddr Pointer to the binary data (actual payload)
+ * @param size Size of the binary payload data in bytes.
+ */
+#define nvtxPayloadRangePush(domain, evtAttr, schemaId, plAddr, size) \
+do { \
+    NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schemaId, plAddr, size) \
+    nvtxDomainRangePushEx(domain, evtAttr); \
+} while (0)
+#endif /* nvtxPayloadRangePush */
+
+#ifndef nvtxPayloadMark
+/**
+ * \brief Helper macro to set a marker with extended payload.
+ * The payload is attached to `evtAttr` and the marker is emitted inside one do/while(0) block.
+ * @param domain NVTX domain handle (0 for default domain)
+ * @param evtAttr pointer to NVTX event attribute.
+ * @param schemaId NVTX payload schema ID
+ * @param plAddr Pointer to the binary data (actual payload)
+ * @param size Size of the binary payload data in bytes.
+ */
+#define nvtxPayloadMark(domain, evtAttr, schemaId, plAddr, size) \
+do { \
+    NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schemaId, plAddr, size) \
+    nvtxDomainMarkEx(domain, evtAttr); \
+} while (0)
+#endif /* nvtxPayloadMark */
+
+#ifdef __GNUC__
+#pragma GCC visibility push(internal)
+#endif
+
+/* Extension types are required for the implementation and the NVTX handler. */
+#define NVTX_EXT_TYPES_GUARD
+#include "nvtxExtDetail/nvtxExtTypes.h"
+#undef NVTX_EXT_TYPES_GUARD
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_EXT_IMPL_PAYLOAD_GUARD
+#include "nvtxExtDetail/nvtxExtImplPayload_v1.h"
+#undef NVTX_EXT_IMPL_PAYLOAD_GUARD
+#endif /* NVTX_NO_IMPL */
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
diff --git a/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImpl.h b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImpl.h
new file mode 100644
index 0000000000..dd215a35c6
--- /dev/null
+++ b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImpl.h
@@ -0,0 +1,102 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#ifndef NVTX_EXT_IMPL_H
+#define NVTX_EXT_IMPL_H
+/* ---- Include required platform headers ---- */
+
+#if defined(_WIN32)
+
+#include <Windows.h>
+
+#else
+#include <unistd.h>
+
+#if defined(__ANDROID__)
+#include <android/api-level.h>
+#endif
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#include <sched.h>
+#endif
+
+#include <limits.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <string.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+#endif
+
+/* ---- Define macros used in this file ---- */
+/* NVTX_ERR / NVTX_INFO: printf-style diagnostics; they expand to nothing unless NVTX_DEBUG_PRINT is defined. No variant ends in ';' -- the caller supplies it. */
+#ifdef NVTX_DEBUG_PRINT
+#ifdef __ANDROID__
+#include <android/log.h>
+#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__)
+#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__)
+#else
+#include <stdio.h>
+#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
+#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
+#endif
+#else /* !defined(NVTX_DEBUG_PRINT) */
+#define NVTX_ERR(...)
+#define NVTX_INFO(...)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+/*
+#ifdef __GNUC__
+#pragma GCC visibility push(hidden)
+#endif
+*/
+#define NVTX_EXTENSION_FRESH 0 /* initial value of the module state and of every function slot */
+#define NVTX_EXTENSION_DISABLED 1 /* function slot without a handler: the API call is skipped */
+#define NVTX_EXTENSION_STARTING 2 /* module state requested by the compare-and-swap in nvtxExtInitOnce */
+#define NVTX_EXTENSION_LOADED 3 /* module state written once nvtxExtInitOnce has finished */
+
+/* Function slots are local to each extension now! Only the injection entry point is kept here. */
+typedef struct nvtxExtGlobals1_t
+{
+    NvtxExtInitializeInjectionFunc_t injectionFnPtr; /* entry point found by nvtxExtInitOnce, 0 until then */
+} nvtxExtGlobals1_t;
+
+NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) =
+{
+    (NvtxExtInitializeInjectionFunc_t)0
+};
+
+#define NVTX_EXT_INIT_GUARD
+#include "nvtxExtInit.h"
+#undef NVTX_EXT_INIT_GUARD
+/*
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+*/
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* NVTX_EXT_IMPL_H */
\ No newline at end of file
diff --git a/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h
new file mode 100644
index 0000000000..a97810ed6a
--- /dev/null
+++ b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h
@@ -0,0 +1,180 @@
+/*
+* Copyright 2021-2023  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#define NVTX_EXT_IMPL_GUARD
+#include "nvtxExtImpl.h"
+#undef NVTX_EXT_IMPL_GUARD
+
+#ifndef NVTX_EXT_IMPL_PAYLOAD_V1
+#define NVTX_EXT_IMPL_PAYLOAD_V1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* Macros to create versioned symbols of the form NAME_v<NVTX_VERSION>_bpl<NVTX_EXT_PAYLOAD_COMPATID>; the _L2 level lets both macros expand before _L3 pastes the tokens. */
+#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
+    NAME##_v##VERSION##_bpl##COMPATID
+#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
+    NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
+#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \
+    NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_PAYLOAD_COMPATID)
+
+#ifdef NVTX_DISABLE
+#include "nvtxExtHelperMacros.h"
+/* NVTX disabled: API functions become stubs that ignore their arguments; non-void stubs return (ret_val)-1. The _VOID stub is required by the void instantiations further down. */
+#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \
+ret_val fn_name signature { \
+    NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
+    return ((ret_val)(intptr_t)-1); \
+}
+#define NVTX_EXT_PAYLOAD_IMPL_FN_V1_VOID(fn_name, signature, arg_names) \
+void fn_name signature { NVTX_EXT_HELPER_UNUSED_ARGS arg_names }
+#else /* NVTX_DISABLE */
+
+#include "nvtxExtPayloadTypeInfo.h"
+
+/*
+ * Function slots for the payload extension. First entry is the module state,
+ * initialized to `0` (`NVTX_EXTENSION_FRESH`); the slot of callback ID `n` is entry `n + 1`.
+ */
+#define NVTX_EXT_PAYLOAD_SLOT_COUNT 63
+NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
+NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX_EXT_PAYLOAD_SLOT_COUNT + 1]
+    = {0}; /* module state plus NVTX_EXT_PAYLOAD_SLOT_COUNT function slots, all starting at 0 */
+
+/* Forward declaration, so that no warning about a missing prototype is emitted. */
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(void);
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
+{
+    /* Entry 0 of the slot array is the module state; the function slots follow it. */
+    intptr_t* const slotState = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots);
+    nvtxExtModuleSegment_t payloadSegment = {
+        0, /* unused (only one segment) */
+        NVTX_EXT_PAYLOAD_SLOT_COUNT,
+        slotState + 1
+    };
+    nvtxExtModuleInfo_t payloadModule = {
+        NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
+        NVTX_EXT_PAYLOAD_MODULEID, NVTX_EXT_PAYLOAD_COMPATID,
+        1, &payloadSegment, /* number of segments, segments */
+        NULL, /* no export function needed */
+        /* bake type sizes and alignment information into program binary */
+        &(NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo))
+    };
+
+    NVTX_INFO( "%s\n", __FUNCTION__  );
+
+    /* Pass the module description and the state entry to the shared one-time initializer. */
+    NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&payloadModule, slotState);
+}
+/* Defines the non-void API function `fn_name`: it forwards to the handler stored in its slot and runs the one-time initialization first when the slot is still fresh. */
+#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \
+typedef ret_type (*fn_name##_impl_fntype)signature; \
+NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
+    intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+    if (slot != NVTX_EXTENSION_DISABLED) { \
+        if (slot != NVTX_EXTENSION_FRESH) { /* the slot holds a handler address */ \
+            return (*(fn_name##_impl_fntype)slot) arg_names; \
+        } else { \
+            NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
+            /* Re-read function slot after extension initialization. */ \
+            slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+            if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
+                return (*(fn_name##_impl_fntype)slot) arg_names; \
+            } \
+        } \
+    } \
+    NVTX_EXT_FN_RETURN_INVALID(ret_type) /* no handler: fallback defined where the macro is instantiated */ \
+}
+/* Same dispatch as NVTX_EXT_PAYLOAD_IMPL_FN_V1 for functions returning void: without a handler the call does nothing. */
+#define NVTX_EXT_PAYLOAD_IMPL_FN_V1_VOID(fn_name, signature, arg_names) \
+typedef void (*fn_name##_impl_fntype)signature; \
+NVTX_DECLSPEC void NVTX_API fn_name signature { \
+    intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+    if (slot != NVTX_EXTENSION_DISABLED) { \
+        if (slot != NVTX_EXTENSION_FRESH) { /* the slot holds a handler address */ \
+            (*(fn_name##_impl_fntype)slot) arg_names; \
+        } else { \
+            NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
+            /* Re-read function slot after extension initialization. */ \
+            slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+            if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
+                (*(fn_name##_impl_fntype)slot) arg_names; \
+            } \
+        } \
+    } \
+}
+
+#endif /*NVTX_DISABLE*/
+
+/* Non-void functions: when no handler is available they return (rtype)-1, see the macro below. */
+#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadSchemaRegister,
+    (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr),
+    (domain, attr))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadEnumRegister,
+    (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr),
+    (domain, attr))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePushPayload,
+    (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
+    (domain, payloadData, count))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePopPayload,
+    (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
+    (domain, payloadData, count))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(nvtxRangeId_t, nvtxRangeStartPayload,
+    (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
+    (domain, payloadData, count))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint8_t, nvtxDomainIsEnabled, (nvtxDomainHandle_t domain), (domain))
+
+/* Experimental: takes no arguments, hence the empty argument list. */
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(int64_t, nvtxTimestampGet, (void), ())
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxScopeRegister, (nvtxDomainHandle_t domain,
+    const nvtxScopeAttr_t* attr), (domain, attr))
+
+#undef NVTX_EXT_FN_RETURN_INVALID
+/* END: Non-void functions. */
+
+/* void functions: defined with the _VOID macro, which simply does nothing when no handler is available. */
+#define NVTX_EXT_FN_RETURN_INVALID(rtype)
+/* The keyword `return` is deliberately not redefined here: the _VOID macro contains no return statement. */
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1_VOID(nvtxMarkPayload, (nvtxDomainHandle_t domain,
+    const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1_VOID(nvtxRangeEndPayload, (nvtxDomainHandle_t domain,
+    nvtxRangeId_t id, const nvtxPayloadData_t* payloadData, size_t count),
+    (domain, id, payloadData, count))
+
+/* Only the helper macro defined at the top of this section has to be removed again. */
+#undef NVTX_EXT_FN_RETURN_INVALID
+/* END: void functions. */
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1_VOID(nvtxRangePushPop, (nvtxDomainHandle_t domain,
+    const nvtxEventAttributes_t* evtAttr, uint64_t pushTime),
+    (domain, evtAttr, pushTime))
+
+/* Keep NVTX_EXT_PAYLOAD_IMPL_FN_V1 defined for a future version of this extension. */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* NVTX_EXT_IMPL_PAYLOAD_V1 */
+
diff --git a/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtInit.h b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtInit.h
new file mode 100644
index 0000000000..743e55b938
--- /dev/null
+++ b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtInit.h
@@ -0,0 +1,378 @@
+/*
+* Copyright 2009-2023  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_INIT_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ---- Platform-independent helper definitions and functions ---- */
+
+/* Prefer macros over inline functions to reduce symbol resolution at link time */
+
+#if defined(_WIN32)
+#define NVTX_PATHCHAR   wchar_t
+#define NVTX_STR(x)     L##x
+#define NVTX_GETENV     _wgetenv
+#define NVTX_BUFSIZE    MAX_PATH
+#define NVTX_DLLHANDLE  HMODULE
+#define NVTX_DLLOPEN(x) LoadLibraryW(x)
+#define NVTX_DLLFUNC    GetProcAddress
+#define NVTX_DLLCLOSE   FreeLibrary
+#define NVTX_YIELD()    SwitchToThread()
+#define NVTX_MEMBAR()   MemoryBarrier()
+#define NVTX_ATOMIC_WRITE_32(address, value)                        InterlockedExchange((volatile LONG*)address, value)
+#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
+#define NVTX_ATOMIC_WRITE_PTR(address, value)                        InterlockedExchangePointer((volatile PVOID*)address, (PVOID)value)
+#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) old = (intptr_t)InterlockedCompareExchangePointer((volatile PVOID*)address, (PVOID)exchange, (PVOID)comparand)
+
+
+#elif defined(__GNUC__)
+#define NVTX_PATHCHAR   char
+#define NVTX_STR(x)     x
+#define NVTX_GETENV     getenv
+#define NVTX_BUFSIZE    PATH_MAX
+#define NVTX_DLLHANDLE  void*
+#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
+#define NVTX_DLLFUNC    dlsym
+#define NVTX_DLLCLOSE   dlclose
+#define NVTX_YIELD()    sched_yield()
+#define NVTX_MEMBAR()   __sync_synchronize()
+/* Ensure full memory barrier for atomics, to match Windows functions. The CAS macros store `exchange` only if `*address` equals `comparand`; __sync_val_compare_and_swap takes (ptr, oldval, newval), so `comparand` is passed before `exchange`. */
+#define NVTX_ATOMIC_WRITE_32(address, value)                  __sync_synchronize();       __sync_lock_test_and_set(address, value)
+#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, comparand, exchange)
+#define NVTX_ATOMIC_WRITE_PTR(address, value)                  __sync_synchronize();       __sync_lock_test_and_set(address, value)
+#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, comparand, exchange)
+#else
+#error The library does not support your configuration!
+#endif
+
+/* Define this to 1 for platforms where pre-injected libraries can be discovered. */
+#if defined(_WIN32)
+/* TODO */
+#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
+#else
+#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
+#endif
+
+/* Define this to 1 for platforms that support environment variables. */
+/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
+/* Try:  #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
+#define NVTX_SUPPORT_ENV_VARS 1
+
+/* Define this to 1 for platforms that support dynamic/shared libraries */
+#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
+
+/* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked.
+ * This is useful for platforms where dynamic injection is not available. The static entry
+ * point is consulted only if no dynamic injection entry point was found. Since weak symbols,
+ * not explicitly marked extern, are guaranteed to be initialized to zero if no definitions are
+ * found by the linker, InitializeInjectionNvtxExtension_fnptr is 0 when nothing is linked in. */
+#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
+#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
+/* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal
+ * symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension, which
+ * does not need to be named "InitializeInjectionNvtxExtension" as it is necessary in a dynamic
+ * injection library. */
+__attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr;
+#else
+#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
+#endif
+
+
+
+/* This function tries to find or load an NVTX injection library and get the address of its
+ * `InitializeInjectionNvtxExtension` function. The function pointer is only looked up here and
+ * stored in `*out_init_fnptr` (if that pointer is non-null); it is not called by this function.
+ * Returns NVTX_SUCCESS if an entry point was found, otherwise one of the NVTX_ERR_* codes; on
+ * the early error returns `*out_init_fnptr` keeps the value 0 assigned at the start.
+ * If the entry point cannot be resolved in a library that was loaded, that library is closed
+ * again; a library whose entry point was found is not closed here. This is implemented as one
+ * function instead of several small functions to minimize the number of weak symbols the linker
+ * must resolve. The order of search is:
+ *  1) Pre-injected library exporting InitializeInjectionNvtxExtension
+ *  2) Loadable library exporting InitializeInjectionNvtxExtension
+ *      - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
+ *      - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
+ *  3) Statically-linked injection library defining InitializeInjectionNvtxExtension_fnptr
+ */
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(
+    NvtxExtInitializeInjectionFunc_t* out_init_fnptr);
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(
+    NvtxExtInitializeInjectionFunc_t* out_init_fnptr)
+{
+    const char* const initFuncName = "InitializeInjectionNvtxExtension";
+    NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
+    NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
+
+    if (out_init_fnptr)
+    {
+        *out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
+    }
+
+#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
+    /* Use POSIX global symbol chain to query for init function from any module. */
+    init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName);
+#endif
+
+#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
+    /* Try discovering dynamic injection library to load */
+    if (!init_fnptr)
+    {
+#if NVTX_SUPPORT_ENV_VARS
+        /* If env var NVTX_INJECTION64_PATH is set, it should contain the path
+           to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
+        const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
+            ? NVTX_STR("NVTX_INJECTION32_PATH")
+            : NVTX_STR("NVTX_INJECTION64_PATH");
+#endif /* NVTX_SUPPORT_ENV_VARS */
+        NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
+        const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
+
+        /* Refer to this variable explicitly in case all references to it are #if'ed out. */
+        (void)injectionLibraryPathBuf;
+
+#if NVTX_SUPPORT_ENV_VARS
+        /* Disable the warning for getenv & _wgetenv -- this usage is safe because
+           these functions are not called again before using the returned value. */
+#if defined(_MSC_VER)
+#pragma warning( push )
+#pragma warning( disable : 4996 )
+#endif
+        injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
+#if defined(_MSC_VER)
+#pragma warning( pop )
+#endif
+#endif
+
+#if defined(__ANDROID__)
+        if (!injectionLibraryPath)
+        {
+            const char *bits = (sizeof(void*) == 4) ? "32" : "64";
+            char cmdlineBuf[32];
+            char pkgName[PATH_MAX];
+            int count;
+            int pid;
+            FILE *fp;
+            size_t bytesRead;
+            size_t pos;
+
+            pid = (int)getpid();
+            count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
+            if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
+            {
+                NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            fp = fopen(cmdlineBuf, "r");
+            if (!fp)
+            {
+                NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
+            fclose(fp);
+            if (bytesRead == 0)
+            {
+                NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            pkgName[bytesRead] = 0;
+
+            /* String can contain colon as a process separator. In this case the
+               package name is before the colon. */
+            pos = 0;
+            while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
+            {
+                ++pos;
+            }
+            pkgName[pos] = 0;
+
+            count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
+            if (count <= 0 || count >= NVTX_BUFSIZE)
+            {
+                NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            /* On Android, verify path is accessible due to aggressive file access restrictions. */
+            /* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
+            /* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
+            if (injectionLibraryPathBuf[0] == '/')
+            {
+#if (__ANDROID_API__ < 21)
+                int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
+#else
+                int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
+#endif
+                if (access_err != 0)
+                {
+                    NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
+                    return NVTX_ERR_INIT_ACCESS_LIBRARY;
+                }
+            }
+            injectionLibraryPath = injectionLibraryPathBuf;
+        }
+#endif
+
+        /* At this point, `injectionLibraryPath` is specified if a dynamic
+           injection library was specified by a tool. */
+        if (injectionLibraryPath)
+        {
+            /* Load the injection library */
+            injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
+            if (!injectionLibraryHandle)
+            {
+                NVTX_ERR("Failed to load injection library\n");
+                return NVTX_ERR_INIT_LOAD_LIBRARY;
+            }
+            else
+            {
+                /* Attempt to get the injection library's entry-point. */
+                init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
+                if (!init_fnptr)
+                {
+                    NVTX_DLLCLOSE(injectionLibraryHandle);
+                    NVTX_ERR("Failed to get address of function %s from injection library\n", initFuncName);
+                    return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
+                }
+            }
+        }
+    }
+#endif
+
+#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
+    if (!init_fnptr)
+    {
+        /* Check weakly-defined function pointer.  A statically-linked injection can define
+           this as a normal symbol; it is used here only if no other entry point was found. */
+        if (InitializeInjectionNvtxExtension_fnptr)
+        {
+            init_fnptr = InitializeInjectionNvtxExtension_fnptr;
+        }
+    }
+#endif
+
+    if (out_init_fnptr)
+    {
+        *out_init_fnptr = init_fnptr;
+    }
+
+    /* At this point, if `init_fnptr` is not set, no tool has specified an NVTX injection library.
+       Non-success result is returned, so that all NVTX API functions will be set to no-ops. */
+    if (!init_fnptr)
+    {
+        return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
+    }
+
+    return NVTX_SUCCESS;
+}
+
+/* One-time initialization of an extension module: `moduleInfo` is handed to the injection's init function, `moduleState` points to the module state entry. The forward declaration avoids warnings about missing prototypes. */
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
+    nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState);
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
+    nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState)
+{
+    intptr_t old; /* module state reported by the compare-and-swap below */
+
+    NVTX_INFO( "%s\n", __FUNCTION__ );
+
+    if (*moduleState == NVTX_EXTENSION_LOADED)
+    {
+        NVTX_INFO("Module loaded\n");
+        return;
+    }
+
+    NVTX_ATOMIC_CAS_PTR(
+        old,
+        moduleState,
+        NVTX_EXTENSION_STARTING,
+        NVTX_EXTENSION_FRESH);
+    if (old == NVTX_EXTENSION_FRESH) /* this thread performs the initialization */
+    {
+        NvtxExtInitializeInjectionFunc_t init_fnptr =
+            NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr;
+        int entryPointStatus = 0;
+        int forceAllToNoops = 0;
+        size_t s;
+
+        /* Load and initialize injection library, which will assign the function pointers. */
+        if (init_fnptr == 0)
+        {
+            int result = 0;
+
+            /* Try to load vanilla NVTX first. */
+            nvtxInitialize(0);
+
+            result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr);
+            /* At this point `init_fnptr` will be either 0 or a real function. */
+
+            if (result == NVTX_SUCCESS)
+            {
+                NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr = init_fnptr;
+            }
+            else
+            {
+                NVTX_ERR("Failed to load injection library\n");
+            }
+        }
+
+        if (init_fnptr != 0)
+        {
+            /* Invoke injection library's initialization function. A return value of
+               0 means failure; no library is unloaded here in that case. */
+            entryPointStatus = init_fnptr(moduleInfo);
+            if (entryPointStatus == 0)
+            {
+                NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
+            }
+        }
+
+        /* Clean up any functions that are still uninitialized so that they are skipped. Set
+           all slots to NVTX_EXTENSION_DISABLED if no init function was found or it failed. */
+        forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0);
+        for (s = 0; s < moduleInfo->segmentsCount; ++s)
+        {
+            nvtxExtModuleSegment_t* segment = moduleInfo->segments + s;
+            size_t i;
+            for (i = 0; i < segment->slotCount; ++i)
+            {
+                if (forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH))
+                {
+                    segment->functionSlots[i] = NVTX_EXTENSION_DISABLED;
+                }
+            }
+        }
+
+        NVTX_MEMBAR();
+
+        /* Signal that initialization has finished and the assigned function
+           pointers will be used. */
+        NVTX_ATOMIC_WRITE_PTR(moduleState, NVTX_EXTENSION_LOADED);
+    }
+    else /* Spin-wait until initialization has finished. */
+    {
+        NVTX_MEMBAR();
+        while (*moduleState != NVTX_EXTENSION_LOADED)
+        {
+            NVTX_YIELD();
+            NVTX_MEMBAR();
+        }
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
diff --git a/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
new file mode 100644
index 0000000000..6a30e6633a
--- /dev/null
+++ b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
@@ -0,0 +1,151 @@
+/*
+* Copyright 2021-2023  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+typedef void* nvtx_payload_pointer_type;
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
+#include <uchar.h>
+#include <stdalign.h>
+#endif
+
+/* `alignof` is available as of C11 or C++11. */
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L)
+
+#define nvtx_alignof(type) alignof(type)
+#define nvtx_alignof2(type,tname) alignof(type)
+
+#else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */
+
+/* Create helper structs to determine type alignment. */
+#define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type
+#define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname
+
+MKTYPEDEF(char);
+MKTYPEDEF2(unsigned char, uchar);
+MKTYPEDEF(short);
+MKTYPEDEF2(unsigned short, ushort);
+MKTYPEDEF(int);
+MKTYPEDEF2(unsigned int, uint);
+MKTYPEDEF(long);
+MKTYPEDEF2(unsigned long, ulong);
+MKTYPEDEF2(long long, longlong);
+MKTYPEDEF2(unsigned long long, ulonglong);
+
+MKTYPEDEF(int8_t);
+MKTYPEDEF(uint8_t);
+MKTYPEDEF(int16_t);
+MKTYPEDEF(uint16_t);
+MKTYPEDEF(int32_t);
+MKTYPEDEF(uint32_t);
+MKTYPEDEF(int64_t);
+MKTYPEDEF(uint64_t);
+
+MKTYPEDEF(float);
+MKTYPEDEF(double);
+MKTYPEDEF2(long double, longdouble);
+
+MKTYPEDEF(size_t);
+MKTYPEDEF(nvtx_payload_pointer_type);
+
+MKTYPEDEF(wchar_t);
+
+/* `char8_t` is available as of C++20 or C23 */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
+    MKTYPEDEF(char8_t);
+#endif
+
+/* `char16_t` and `char32_t` are available as of C++11 or C11 */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
+    MKTYPEDEF(char16_t);
+    MKTYPEDEF(char32_t);
+#endif
+
+/* C requires to include stddef.h to use `offsetof` */
+#ifndef __cplusplus
+#include <stddef.h>
+#endif
+
+#define nvtx_alignof(tname) offsetof(_nvtx_##tname, d)
+#define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d)
+
+#endif /*  __STDC_VERSION__ >= 201112L */
+
+#undef MKTYPEDEF
+#undef MKTYPEDEF2
+
+/*
+ * Helper array to get the alignment for each predefined C/C++ language type.
+ * The order of entries must match the values in `enum nvtxPayloadSchemaEntryType`.
+ *
+ * In C++, `const` variables use internal linkage by default, but we need it to
+ * be public (extern) since weak declarations must be public.
+ */
+NVTX_LINKONCE_DEFINE_GLOBAL
+#ifdef __cplusplus
+extern
+#endif
+const nvtxPayloadEntryTypeInfo_t
+NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
+{
+    /* The first entry contains this array's length and the size of each entry in this array. */
+    {NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)},
+
+    /*** C integer types ***/
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR */   {sizeof(char), nvtx_alignof(char)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UCHAR */  {sizeof(unsigned char), nvtx_alignof2(unsigned char, uchar)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_SHORT */  {sizeof(short), nvtx_alignof(short)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_USHORT */ {sizeof(unsigned short), nvtx_alignof2(unsigned short, ushort)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_INT */    {sizeof(int), nvtx_alignof(int)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UINT */   {sizeof(unsigned int), nvtx_alignof2(unsigned int, uint)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_LONG */   {sizeof(long), nvtx_alignof(long)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_ULONG */  {sizeof(unsigned long), nvtx_alignof2(unsigned long, ulong)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG */  {sizeof(long long), nvtx_alignof2(long long, longlong)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG */ {sizeof(unsigned long long), nvtx_alignof2(unsigned long long,ulonglong)},
+
+    /*** Integer types with explicit size ***/
+    /* NVTX_PAYLOAD_ENTRY_TYPE_INT8 */   {sizeof(int8_t),   nvtx_alignof(int8_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UINT8 */  {sizeof(uint8_t),  nvtx_alignof(uint8_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_INT16 */  {sizeof(int16_t),  nvtx_alignof(int16_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UINT16 */ {sizeof(uint16_t), nvtx_alignof(uint16_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_INT32 */  {sizeof(int32_t),  nvtx_alignof(int32_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UINT32 */ {sizeof(uint32_t), nvtx_alignof(uint32_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_INT64 */  {sizeof(int64_t),  nvtx_alignof(int64_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UINT64 */ {sizeof(uint64_t), nvtx_alignof(uint64_t)},
+
+    /*** C floating point types ***/
+    /* NVTX_PAYLOAD_ENTRY_TYPE_FLOAT */      {sizeof(float),       nvtx_alignof(float)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE */     {sizeof(double),      nvtx_alignof(double)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)},
+
+    /* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */    {sizeof(size_t),       nvtx_alignof(size_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(nvtx_payload_pointer_type), nvtx_alignof(nvtx_payload_pointer_type)},
+
+    /*** Special character types ***/
+    /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)},
+#else
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0},
+#endif
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)}
+#else
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {0, 0},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {0, 0}
+#endif
+};
+
+#undef nvtx_alignof
+#undef nvtx_alignof2
\ No newline at end of file
diff --git a/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtTypes.h b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtTypes.h
new file mode 100644
index 0000000000..bcad095a0c
--- /dev/null
+++ b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtTypes.h
@@ -0,0 +1,44 @@
+/*
+* Copyright 2021  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+/* This header defines types which are used by the internal implementation
+*  of NVTX and callback subscribers.  API clients do not use these types,
+*  so they are defined here instead of in nvToolsExt.h to clarify they are
+*  not part of the NVTX client API. */
+
+#ifndef NVTXEXTTYPES_H
+#define NVTXEXTTYPES_H
+
+#ifndef NVTX_EXT_TYPES_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h.
+#endif
+
+typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId);
+
+/* One group of function slots belonging to an extension module. */
+typedef struct nvtxExtModuleSegment_t
+{
+    size_t segmentId;
+    size_t slotCount;        /* Number of entries in functionSlots */
+    intptr_t* functionSlots; /* Array of slotCount slot values */
+} nvtxExtModuleSegment_t;
+
+/* Description of an extension module; a pointer to this structure is the
+*  sole argument of NvtxExtInitializeInjectionFunc_t. */
+typedef struct nvtxExtModuleInfo_t
+{
+    uint16_t nvtxVer;
+    uint16_t structSize;
+    uint16_t moduleId;
+    uint16_t compatId;
+    size_t segmentsCount;             /* Number of entries in segments */
+    nvtxExtModuleSegment_t* segments; /* Array of segmentsCount segments */
+    NvtxExtGetExportFunction_t getExportFunction;
+    const void* extInfo;
+} nvtxExtModuleInfo_t;
+
+typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo);
+
+#endif /* NVTXEXTTYPES_H */
\ No newline at end of file
diff --git a/src/main/cpp/profiler/nvtxw3.cpp b/src/main/cpp/profiler/nvtxw3.cpp
new file mode 100644
index 0000000000..b18aba73da
--- /dev/null
+++ b/src/main/cpp/profiler/nvtxw3.cpp
@@ -0,0 +1,874 @@
+/*
+ *  Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  Licensed under the Apache License v2.0 with LLVM Exceptions.
+ *  See https://llvm.org/LICENSE.txt for license information.
+ *
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(_WIN32)
+#include <Windows.h>
+#else
+#include <unistd.h>
+#include <sys/types.h>
+#if defined (_QNX_SOURCE)
+#include <signal.h>
+#include <errno.h>
+#else
+#include <sys/signal.h>
+#endif
+#include <sys/wait.h>
+#include <sys/stat.h>
+#endif
+
+#if defined(__APPLE__)
+#include <libproc.h>
+#endif
+
+#include "nvtxw3.h"
+
+/*-------------------------------------------------------------*/
+/* Path string helpers -- implement here to avoid dependencies */
+
+#if defined(_WIN32)
+static const char pathSep = '\\';
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+static const char pathDelimiter = ';';
+#endif
+static const size_t initialPathBufSize = MAX_PATH; /* Grows if not big enough */
+#define NVTXW3_DLLHANDLE  HMODULE
+#define NVTXW3_DLLOPEN(x) LoadLibraryA(x)
+#define NVTXW3_DLLFUNC    GetProcAddress
+#define NVTXW3_DLLCLOSE   FreeLibrary
+#else
+static const char pathSep = '/';
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+static const char pathDelimiter = ':';
+#endif
+static const size_t initialPathBufSize = 260; /* Grows if not big enough */
+#define NVTXW3_DLLHANDLE  void*
+#define NVTXW3_DLLOPEN(x) dlopen(x, RTLD_LAZY)
+#define NVTXW3_DLLFUNC    dlsym
+#define NVTXW3_DLLCLOSE   dlclose
+#endif
+
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+/* Rewrites path in place so that every '/' becomes the platform's native
+*  path separator.  Only Windows builds do any work; in other builds the
+*  argument is ignored.  A NULL path is accepted. */
+static void ForwardSlashesToNative(char* path)
+{
+#if _WIN32
+    size_t i;
+    if (path == NULL) return;
+    for (i = 0; path[i] != '\0'; ++i)
+    {
+        if (path[i] == '/')
+        {
+            path[i] = pathSep;
+        }
+    }
+#else
+    (void)path;
+#endif
+}
+#endif
+
+/* Removes every trailing path separator from path, in place, by writing a
+*  null terminator over the first separator of the trailing run.  A path
+*  consisting solely of separators (e.g. the root directory) becomes an
+*  empty string.  The buffer is only written to when something is actually
+*  stripped.  A NULL path is ignored. */
+static void StripTrailingSlashes(char* path)
+{
+    size_t len;
+
+    /* Path helpers in this file may hand over NULL (for example when the
+    *  executable path cannot be resolved), so never pass it to strlen. */
+    if (!path) return;
+
+    len = strlen(path);
+    while (len > 0 && path[len - 1] == pathSep)
+    {
+        --len;
+    }
+
+    /* path[len] is already the terminator when nothing was stripped */
+    if (path[len] != '\0')
+    {
+        path[len] = '\0';
+    }
+}
+
+/* Returns a pointer to the first character of cur that is not a path
+*  separator; this is the terminating null when cur holds only separators.
+*  Nothing is modified -- the result points into the caller's buffer. */
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+static char* AfterLeadingSlashes(char* cur)
+{
+    while (*cur && *cur == pathSep)
+    {
+        ++cur;
+    }
+    return cur;
+}
+#endif
+/* Returns a pointer to the first character of the read-only string cur that
+*  is not a path separator (possibly its terminating null). */
+static const char* AfterLeadingSlashesConst(const char* cur)
+{
+    while (*cur && *cur == pathSep)
+    {
+        ++cur;
+    }
+    return cur;
+}
+
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+/* Removes leading path separators from path in place by sliding the rest of
+*  the string, null terminator included, down to the start of the buffer. */
+static void StripLeadingSlashes(char* path)
+{
+    char* firstKept = AfterLeadingSlashes(path);
+    if (firstKept == path) return; /* Nothing to strip */
+
+    /* Source and destination overlap, so memmove rather than memcpy */
+    memmove(path, firstKept, strlen(firstKept) + 1);
+}
+#endif
+
+/* Replaces the contents of HeapString lhs (which may be NULL) with a copy of
+*  the C string rhs, reallocating lhs to fit.  Returns the resulting
+*  HeapString, which must be freed with free().  Returns NULL without touching
+*  lhs when rhs is NULL.  If memory cannot be obtained, lhs is freed and NULL
+*  is returned, so the usual "s = AssignHeapString(s, x)" pattern neither
+*  leaks nor copies into a null buffer. */
+static char* AssignHeapString(char* lhs, const char* rhs)
+{
+    size_t lenWithNull;
+    char* resized;
+
+    if (!rhs) return NULL;
+
+    lenWithNull = strlen(rhs) + 1;
+    resized = (char*)realloc(lhs, lenWithNull);
+    if (!resized)
+    {
+        /* realloc leaves the original block allocated on failure */
+        free(lhs);
+        return NULL;
+    }
+    memcpy(resized, rhs, lenWithNull);
+    return resized;
+}
+
+/* Replaces the contents of HeapString lhs (which may be NULL) with a copy of
+*  the characters in [rhsBegin, rhsEnd), adding a null terminator.  Returns
+*  the resulting HeapString, which must be freed with free().  Returns NULL
+*  without touching lhs when either bound is NULL or the range is reversed.
+*  If memory cannot be obtained, lhs is freed and NULL is returned. */
+static char* AssignHeapStringFromRange(char* lhs, const char* rhsBegin, const char* rhsEnd)
+{
+    size_t lenWithoutNull;
+    char* resized;
+
+    /* A reversed range would turn into an enormous unsigned length */
+    if (!rhsBegin || !rhsEnd || rhsEnd < rhsBegin) return NULL;
+
+    lenWithoutNull = (size_t)(rhsEnd - rhsBegin);
+    resized = (char*)realloc(lhs, lenWithoutNull + 1);
+    if (!resized)
+    {
+        /* realloc leaves the original block allocated on failure */
+        free(lhs);
+        return NULL;
+    }
+    memcpy(resized, rhsBegin, lenWithoutNull);
+    resized[lenWithoutNull] = '\0';
+    return resized;
+}
+
+/* Returns a newly allocated HeapString holding a copy of the C string str,
+*  or NULL when str is NULL.  The result must be freed with free(). */
+static char* MakeHeapString(const char* str)
+{
+    char* copy = AssignHeapString(NULL, str);
+    return copy;
+}
+
+/* Returns a newly allocated, null-terminated HeapString holding the
+*  characters in [strBegin, strEnd), or NULL when either bound is NULL.
+*  The result must be freed with free(). */
+static char* MakeHeapStringFromRange(const char* strBegin, const char* strEnd)
+{
+    char* copy = AssignHeapStringFromRange(NULL, strBegin, strEnd);
+    return copy;
+}
+
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+/* Returns a HeapString copy of str after passing it through
+*  ForwardSlashesToNative.  The result must be freed with free(). */
+static char* MakeHeapStringWithNativeSlashes(const char* str)
+{
+    char* copy = MakeHeapString(str);
+    ForwardSlashesToNative(copy);
+    return copy;
+}
+
+/* Appends the C string rhs to the HeapString lhs, reallocating lhs to fit.
+*  A NULL lhs is treated as empty; a NULL or empty rhs leaves lhs unchanged.
+*  Returns the resulting HeapString, which may or may not be the pointer
+*  passed in as lhs and must be freed with free().  If memory cannot be
+*  obtained, lhs is freed and NULL is returned. */
+static char* AppendToHeapString(char* lhs, const char* rhs)
+{
+    size_t lenLhs, lenRhs;
+    char* resized;
+
+    lenRhs = rhs ? strlen(rhs) : 0;
+    if (lenRhs == 0) return lhs;
+
+    lenLhs = lhs ? strlen(lhs) : 0;
+    resized = (char*)realloc(lhs, lenLhs + lenRhs + 1);
+    if (!resized)
+    {
+        /* realloc leaves the original block allocated on failure */
+        free(lhs);
+        return NULL;
+    }
+    /* Copy rhs together with its null terminator */
+    memcpy(resized + lenLhs, rhs, lenRhs + 1);
+    return resized;
+}
+#endif
+
+/* Take pointer to a HeapString (lhs) and any C string (rhs), append rhs to lhs,
+*  with a path separator between them, reallocating the heap memory for lhs if
+*  necessary.  If rhs is null or empty, then the result is lhs unmodified.  If
+*  lhs is null or empty and rhs is not, then the result is a path separator
+*  followed by rhs.  Returns pointer to result HeapString, which may or may not
+*  be the same pointer passed in as lhs.  HeapString must be freed with free().
+*  If memory cannot be obtained, lhs is freed and NULL is returned. */
+static char* AppendToHeapStringWithSep(char* lhs, const char* rhs)
+{
+    size_t lenLhs, lenRhs;
+    char* resized;
+
+    /* NULL arguments are legal per the contract above, so they must never
+    *  reach strlen. */
+    lenRhs = rhs ? strlen(rhs) : 0;
+    if (lenRhs == 0) return lhs;
+    lenLhs = lhs ? strlen(lhs) : 0;
+
+    /* Room for lhs, the separator, rhs and the null terminator */
+    resized = (char*)realloc(lhs, lenLhs + lenRhs + 2);
+    if (!resized)
+    {
+        /* realloc leaves the original block allocated on failure */
+        free(lhs);
+        return NULL;
+    }
+    resized[lenLhs] = pathSep;
+    memcpy(resized + lenLhs + 1, rhs, lenRhs + 1);
+    return resized;
+}
+
+/* dir is a HeapString, or NULL.  If dir is NULL, empty or just slashes, the
+*  result will be a path relative to the root, i.e. beginning with a path
+*  separator.  relativePath must be a valid relative path (not empty, not just
+*  slashes); its leading slashes are skipped, and a NULL relativePath leaves
+*  dir unchanged.  Returns pointer to result HeapString, which may or may not
+*  be the same pointer passed in as dir.  HeapString must be freed with
+*  free(). */
+static char* AppendToPathHeapString(char* dir, const char* relativePath)
+{
+    const char* relPathAfterLeadingSlashes;
+
+    if (!relativePath) return dir;
+    relPathAfterLeadingSlashes = AfterLeadingSlashesConst(relativePath);
+
+    /* dir may be NULL, e.g. when it comes from GetCurrentProcessDir() and
+    *  the executable path could not be read; the append below then treats
+    *  it like an empty directory. */
+    if (dir) StripTrailingSlashes(dir);
+    return AppendToHeapStringWithSep(dir, relPathAfterLeadingSlashes);
+}
+
+/* Reads the entire file named by filename into a newly allocated,
+*  null-terminated HeapString.  Returns NULL if the file cannot be opened,
+*  measured or read completely, or if memory cannot be allocated.  The
+*  result must be freed with free(). */
+static char* LoadFileIntoHeapString(const char* filename)
+{
+    char* contents = NULL;
+    long fileLen;
+    size_t byteCount;
+    FILE* file = fopen(filename, "rb");
+
+    if (!file) return NULL;
+
+    /* Seek to the end to learn the length, then go back to the start */
+    if (fseek(file, 0, SEEK_END) == 0 && (fileLen = ftell(file)) >= 0)
+    {
+        rewind(file);
+        byteCount = (size_t)fileLen;
+
+        contents = (char*)malloc(byteCount + 1);
+        if (contents)
+        {
+            if (fread(contents, 1, byteCount, file) < byteCount)
+            {
+                /* Short read -- discard the partial contents */
+                free(contents);
+                contents = NULL;
+            }
+            else
+            {
+                contents[byteCount] = '\0';
+            }
+        }
+    }
+
+    fclose(file);
+    return contents;
+}
+
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+/* Returns 1 if the string contains at least one path separator, else 0. */
+static int HasSlashes(const char* cur)
+{
+    return strchr(cur, pathSep) != NULL ? 1 : 0;
+}
+
+/* Returns 1 if str is non-empty and its last character is a path separator,
+*  else 0. */
+static int HasTrailingSlash(const char* str)
+{
+    const char* end = str + strlen(str);
+    return end != str && end[-1] == pathSep;
+}
+#endif
+
+/* Returns the current working directory as a HeapString, which must be
+*  freed with free().  If the directory cannot be determined an empty string
+*  is returned (the same convention GetCurrentProcessPath uses for a failed
+*  query); NULL is returned only when memory cannot be allocated. */
+static char* GetCurrentWorkingDir()
+{
+#if defined(_WIN32)
+    DWORD size;
+    DWORD written;
+    char* buf;
+
+    /* With a zero-length buffer this returns the required size including
+    *  space for the null terminator, or 0 on failure */
+    size = GetCurrentDirectoryA(0, NULL);
+    buf = (char*)malloc(size > 0 ? size : 1);
+    if (!buf) return NULL;
+
+    /* On success the return value excludes the terminator, so it is
+    *  smaller than size; anything else means the query failed or the
+    *  directory changed to a longer one between the two calls. */
+    written = (size > 0) ? GetCurrentDirectoryA(size, buf) : 0;
+    if (written == 0 || written >= size)
+    {
+        buf[0] = '\0';
+    }
+    return buf;
+#else
+    size_t size = initialPathBufSize;
+    char* buf;
+    char* resized;
+
+    buf = (char*)malloc(size);
+    if (!buf) return NULL;
+
+    while (!getcwd(buf, size))
+    {
+        /* Only ERANGE means the buffer was too small.  Any other error
+        *  (e.g. the directory was removed or is unreadable) would fail
+        *  again at every size, so stop instead of growing forever. */
+        if (errno != ERANGE)
+        {
+            buf[0] = '\0';
+            return buf;
+        }
+        size *= 2;
+        resized = (char*)realloc(buf, size);
+        if (!resized) { free(buf); return NULL; }
+        buf = resized;
+    }
+
+    /* Shrink to fit; keep the larger buffer if that fails */
+    resized = (char*)realloc(buf, strlen(buf) + 1);
+    return resized ? resized : buf;
+#endif
+}
+
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+/* Converts a possibly-relative path into an absolute one and returns it as
+*  a heap-allocated string that must be freed with free().  The input must
+*  not be empty; a NULL input yields NULL.  On Windows the conversion is done
+*  by GetFullPathNameA; elsewhere a path that already starts with a separator
+*  is copied as is, and any other path is appended to the current working
+*  directory. */
+static char* AbsolutePath(const char* path)
+{
+#if defined(_WIN32)
+    size_t needed;
+    char* fullPath;
+
+    if (path == NULL) return NULL;
+
+    /* First call reports the buffer size needed, null terminator included */
+    needed = (size_t)GetFullPathNameA(path, 0, NULL, NULL);
+    fullPath = (char*)malloc(needed);
+    GetFullPathNameA(path, needed, fullPath, NULL);
+    return fullPath;
+#else
+    if (path == NULL) return NULL;
+
+    if (path[0] == pathSep)
+    {
+        /* Already absolute -- just copy it */
+        return MakeHeapString(path);
+    }
+    return AppendToPathHeapString(GetCurrentWorkingDir(), path);
+#endif
+}
+#endif
+
+/* Take pointer to heap string of path, and modifies it in-place to be its
+*  parent directory, i.e. the directory containing the input file/directory.
+*  String is shortened, but not reallocated, permitting possibly faster
+*  appending of different path later.  Returns the pointer passed in without
+*  modifying it for convenient chaining of path functions.  If input path is
+*  NULL, NULL is returned.  If input is an empty string, the root directory,
+*  or a name without any separator, the heap string is set to an empty string
+*  to indicate there is no parent directory.  A path directly below the root
+*  (e.g. "/usr" with '/' as separator) becomes the root directory itself.
+*  Returned pointer to heap-allocated string must be freed with free(). */
+static char* ToParentDir(char* path)
+{
+    char* lastSep;
+
+    if (!path) return NULL;
+
+    StripTrailingSlashes(path);
+
+    /* strrchr finds the last separator without ever stepping a pointer in
+    *  front of the buffer, which a backwards scan would do on reaching
+    *  the first character. */
+    lastSep = strrchr(path, pathSep);
+    if (!lastSep)
+    {
+        /* No separator left, so there is no parent directory.  Nulling the
+        *  first character is safe because every heap string is at least
+        *  one byte long. */
+        path[0] = '\0';
+    }
+    else if (lastSep == path)
+    {
+        /* The only separator is the first character.  Trailing separators
+        *  were trimmed above, so at least one more character follows it
+        *  and the parent is the root directory: keep the separator and
+        *  terminate right after it. */
+        lastSep[1] = '\0';
+    }
+    else
+    {
+        /* Terminate the string just before the last separator */
+        *lastSep = '\0';
+    }
+    return path;
+}
+
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+/* Returns the parent directory of path (the directory containing the given
+*  file or directory) as a newly heap-allocated string that must be freed
+*  with free().  The input is left untouched.  NULL is returned when path is
+*  NULL or when ToParentDir reports that there is no parent directory, so the
+*  return value must be NULL-checked. */
+static char* ParentDir(const char* path)
+{
+    char* parent;
+
+    if (path == NULL) return NULL;
+
+    parent = ToParentDir(MakeHeapString(path));
+    if (parent[0] != '\0') return parent;
+
+    /* An empty result means there is no parent directory */
+    free(parent);
+    return NULL;
+}
+
+/* Returns non-zero if the platform's existence check succeeds for path
+*  (GetFileAttributesA on Windows, access() with F_OK elsewhere), else 0. */
+static int PathExists(const char* path)
+{
+#if defined(_WIN32)
+    return GetFileAttributesA(path) != INVALID_FILE_ATTRIBUTES;
+#else
+    return access(path, F_OK) != -1;
+#endif
+}
+#endif
+
+/* Return a heap string containing the full path of the current process's
+*  executable file.  Buffer allocated may be a little larger than the path
+*  string it contains, and is not realloc'ed to fit since typical usage of
+*  this function involves getting the parent directory and appending to it.
+*  Returns NULL if memory cannot be allocated or, in the /proc/self/exe
+*  branch, if the link cannot be read; in the other branches a failed query
+*  yields an empty string.
+*  Returned pointer to heap-allocated string must be freed with free(). */
+static char* GetCurrentProcessPath()
+{
+    char* buf;
+#if defined(_WIN32)
+    {
+        DWORD size = (DWORD)initialPathBufSize;
+        DWORD newSize;
+        char* resized;
+        buf = NULL;
+        while (1)
+        {
+            resized = (char*)realloc(buf, size);
+            if (!resized) { free(buf); return NULL; }
+            buf = resized;
+            newSize = GetModuleFileNameA(NULL, buf, size);
+            if (newSize == 0)
+            {
+                /* Query failed -- do not hand back unspecified contents */
+                buf[0] = '\0';
+                break;
+            }
+            /* A return value equal to size means the path was truncated */
+            if (newSize < size) break;
+            size *= 2;
+        }
+    }
+#elif defined(__APPLE__)
+    {
+        int pathLen;
+        buf = (char*)malloc(PROC_PIDPATHINFO_MAXSIZE);
+        if (!buf) return NULL;
+        /* The result is kept in a signed int so that a negative error
+        *  return is not mistaken for a large length. */
+        pathLen = proc_pidpath(getpid(), buf, PROC_PIDPATHINFO_MAXSIZE);
+        if (pathLen <= 0)
+        {
+            buf[0] = '\0';
+        }
+    }
+#elif defined(__QNX__)
+    {
+        /* fpathconf returns -1 on error.  Keep the result signed until it
+        *  has been validated; stored directly in a size_t it would wrap
+        *  to zero on the increment below and leave _cmdname writing into
+        *  a zero-byte allocation. */
+        long maxInput = fpathconf(0, _PC_MAX_INPUT);
+        size_t size = (maxInput > 0) ? (size_t)maxInput : 4096;
+        ++size;
+        buf = (char*)malloc(size);
+        if (!buf) return NULL;
+        if (!_cmdname(buf))
+        {
+            buf[0] = '\0';
+        }
+    }
+#else
+    {
+        size_t size = initialPathBufSize;
+        ssize_t bytesReadSigned;
+        size_t bytesRead;
+        char* resized;
+        const char* linkName = "/proc/self/exe";
+        buf = NULL;
+        while (1)
+        {
+            resized = (char*)realloc(buf, size);
+            if (!resized) { free(buf); return NULL; }
+            buf = resized;
+            bytesReadSigned = readlink(linkName, buf, size);
+            if (bytesReadSigned < 0) { free(buf); return NULL; }
+            bytesRead = (size_t)bytesReadSigned;
+            /* A completely filled buffer may mean truncation, and leaves
+            *  no room for the terminator added below -- retry larger. */
+            if (bytesRead < size) break;
+            size *= 2;
+        }
+        /* readlink does not null-terminate its output */
+        buf[bytesRead] = '\0';
+    }
+#endif
+    return buf;
+}
+
+/* Returns a HeapString naming the directory that contains the current
+*  process's executable, obtained by trimming the executable path to its
+*  parent directory.  NULL is passed through when the path lookup returns
+*  NULL.  The result must be freed with free(). */
+static char* GetCurrentProcessDir()
+{
+    char* exePath = GetCurrentProcessPath();
+    return ToParentDir(exePath);
+}
+
+/* Key/value consumer passed to nvtxwConsumeConfigString when a config string
+*  is compacted in place.  state points to a char* write cursor; each call
+*  writes "<key>=<value>\n" at the cursor, stores the advanced cursor back
+*  through state, and returns 0.  Intended for use where the key and value
+*  ranges lie in the same writable buffer as the cursor. */
+static int KVPConsumerForSimplify(
+    void* state,
+    const char* readKeyBegin,
+    const char* readKeyEnd,
+    const char* readValBegin,
+    const char* readValEnd)
+{
+    char** cursorPtr = (char**)state;
+    char* out = *cursorPtr;
+    const size_t keyLen = (size_t)(readKeyEnd - readKeyBegin);
+    const size_t valLen = (size_t)(readValEnd - readValBegin);
+
+    /* Source and destination may overlap or even coincide, hence memmove
+    *  rather than memcpy for both pieces. */
+    memmove(out, readKeyBegin, keyLen);
+    out += keyLen;
+    *out++ = '=';
+
+    memmove(out, readValBegin, valLen);
+    out += valLen;
+    *out++ = '\n';
+
+    *cursorPtr = out;
+    return 0;
+}
+
+/* Rewrites the HeapString config in place as a sequence of "key=value\n"
+*  lines, one per pair reported by nvtxwConsumeConfigString, and returns the
+*  resulting HeapString, which must be freed with free().  A NULL config
+*  yields NULL.  If memory cannot be obtained, config is freed and NULL is
+*  returned. */
+static char* SimplifyConfigHeapString(char* config)
+{
+    char* curWrite;
+    char* resized;
+
+    if (!config) return NULL;
+
+    /* Every rewritten pair ends with '\n', and a null terminator follows the
+    *  last one.  The caller in this file passes a buffer of exactly
+    *  strlen + 1 bytes, so when the final pair of the input has no trailing
+    *  line break the output is one byte longer than the input.  Grow the
+    *  buffer by that byte before rewriting.
+    *  NOTE(review): assumes each pair occupies at least key + one separator
+    *  character + value in the input -- confirm against
+    *  nvtxwConsumeConfigString. */
+    resized = (char*)realloc(config, strlen(config) + 2);
+    if (!resized) { free(config); return NULL; }
+    config = resized;
+
+    curWrite = config;
+    nvtxwConsumeConfigString(config, KVPConsumerForSimplify, &curWrite);
+    *curWrite = '\0';
+
+    /* Shrink to fit; keep the larger buffer if that fails */
+    resized = (char*)realloc(config, strlen(config) + 1);
+    return resized ? resized : config;
+}
+
+/* Accumulator filled in by KVPConsumerForGetInitMode while a config string
+*  is scanned for its loader settings. */
+typedef struct GetInitModeState_t
+{
+    int modeFound;       /* Non-zero once an "InitMode" key has been seen */
+    int modeStringFound; /* Non-zero once an "InitModeString" key has been seen */
+    int mode;            /* Value of the "InitMode" key, converted with atoi */
+    char* modeString;    /* HeapString copy of the "InitModeString" value; free() it */
+} GetInitModeState_t;
+
+/* Key/value consumer passed to nvtxwConsumeConfigString to pick the loader
+*  settings out of a config string.  statePtr points to a GetInitModeState_t.
+*  The first "InitMode" value is converted with atoi and stored in mode; the
+*  first "InitModeString" value is copied into a HeapString stored in
+*  modeString.  Later occurrences of either key are ignored.
+*  Returns non-zero once the mode is known and either that mode is
+*  NVTXW3_INIT_MODE_SEARCH_DEFAULT or the mode string is known as well;
+*  otherwise returns 0.
+*  NOTE(review): a non-zero return presumably lets the parser stop early --
+*  confirm against nvtxwConsumeConfigString. */
+static int KVPConsumerForGetInitMode(
+    void* statePtr,
+    const char* keyBegin,
+    const char* keyEnd,
+    const char* valBegin,
+    const char* valEnd)
+{
+    static const char modeKey[] = "InitMode";
+    static const char modeStringKey[] = "InitModeString";
+    GetInitModeState_t* st = (GetInitModeState_t*)statePtr;
+    const size_t keyLen = (size_t)(keyEnd - keyBegin);
+
+    if (!st->modeFound
+        && keyLen == sizeof(modeKey) - 1
+        && strncmp(keyBegin, modeKey, keyLen) == 0)
+    {
+        /* atoi needs a null-terminated string, so copy the range first */
+        char* text = MakeHeapStringFromRange(valBegin, valEnd);
+        st->mode = atoi(text);
+        free(text);
+        st->modeFound = 1;
+    }
+
+    if (!st->modeStringFound
+        && keyLen == sizeof(modeStringKey) - 1
+        && strncmp(keyBegin, modeStringKey, keyLen) == 0)
+    {
+        /* Ownership of the copy moves to the state structure */
+        st->modeString = MakeHeapStringFromRange(valBegin, valEnd);
+        st->modeStringFound = 1;
+    }
+
+    if (!st->modeFound) return 0;
+    return st->mode == NVTXW3_INIT_MODE_SEARCH_DEFAULT || st->modeStringFound;
+}
+
+/* Scans config for the loader mode and mode string.  On success returns zero
+*  and writes both out parameters; *modeString receives a HeapString (or NULL
+*  if the config had none) that the caller must free().  Returns 1 if either
+*  out parameter is NULL or no mode was found, and 2 if the mode is anything
+*  other than NVTXW3_INIT_MODE_SEARCH_DEFAULT and no mode string was found.
+*  On failure the out parameters, when provided, are left as 0 and NULL. */
+static int GetInitModeFromConfig(const char* config, int* mode, char** modeString)
+{
+    GetInitModeState_t scan = {0};
+    int needsModeString;
+
+    if (mode == NULL || modeString == NULL) return 1;
+
+    /* Values reported on every failure path */
+    *mode = 0;
+    *modeString = NULL;
+
+    nvtxwConsumeConfigString(config, KVPConsumerForGetInitMode, &scan);
+
+    if (!scan.modeFound)
+    {
+        /* A mode string may have been collected without a mode -- drop it */
+        free(scan.modeString);
+        return 1;
+    }
+
+    needsModeString = (scan.mode != NVTXW3_INIT_MODE_SEARCH_DEFAULT);
+    if (needsModeString && !scan.modeStringFound) return 2;
+
+    *mode = scan.mode;
+    *modeString = scan.modeString;
+    return 0;
+}
+
+/*-------------------------------------------------------------*/
+/* Backend loader helpers */
+
+/* Loads the backend library named by filename, looks up its
+*  "nvtxwLoadImplementation" entry point and calls it with a simplified copy
+*  of configString (NULL when no config string was given).
+*  On success, *getInterfaceFunc receives the backend's interface getter and,
+*  if moduleHandle is non-NULL, *moduleHandle receives the library handle,
+*  which stays loaded.  On any failure both outputs are NULL and the library,
+*  if it was opened, is closed again.
+*  Returns NVTXW3_RESULT_SUCCESS, or:
+*    NVTXW3_RESULT_INVALID_ARGUMENT      - filename is NULL
+*    NVTXW3_RESULT_LIBRARY_NOT_FOUND     - the library could not be opened
+*    NVTXW3_RESULT_LOADER_SYMBOL_MISSING - entry point not exported
+*    NVTXW3_RESULT_LOADER_FAILED         - loader reported success but gave
+*                                          no interface getter
+*    any other code returned by the loader itself. */
+static nvtxwResultCode_t InitLibraryFilename(
+    const char* filename,                  /* required */
+    const char* configString,              /* optional */
+    nvtxwGetInterface_t* getInterfaceFunc, /* already null-checked */
+    void** moduleHandle)                   /* optional */
+{
+    NVTXW3_DLLHANDLE hModule;
+    nvtxwLoadImplementation_t pfnLoadImplementation;
+    nvtxwGetInterface_t tempGetInterfaceFunc = NULL;
+    nvtxwResultCode_t result;
+    char* configSimple = NULL;
+
+    /* Outputs are only filled in once everything has succeeded */
+    *getInterfaceFunc = NULL;
+    if (moduleHandle) *moduleHandle = NULL;
+
+    if (!filename)
+    {
+        return NVTXW3_RESULT_INVALID_ARGUMENT;
+    }
+
+    hModule = NVTXW3_DLLOPEN(filename);
+    if (!hModule)
+    {
+        return NVTXW3_RESULT_LIBRARY_NOT_FOUND;
+    }
+
+    pfnLoadImplementation = (nvtxwLoadImplementation_t)NVTXW3_DLLFUNC(hModule, "nvtxwLoadImplementation");
+    if (!pfnLoadImplementation)
+    {
+        NVTXW3_DLLCLOSE(hModule);
+        return NVTXW3_RESULT_LOADER_SYMBOL_MISSING;
+    }
+
+    if (configString)
+    {
+        configSimple = SimplifyConfigHeapString(MakeHeapString(configString));
+    }
+
+    result = pfnLoadImplementation(configSimple, &tempGetInterfaceFunc);
+    free(configSimple);
+    if (result == NVTXW3_RESULT_SUCCESS && !tempGetInterfaceFunc)
+    {
+        /* The library is about to be closed and the caller gets no
+        *  interface getter, so this must not be reported as success. */
+        result = NVTXW3_RESULT_LOADER_FAILED;
+    }
+    if (result != NVTXW3_RESULT_SUCCESS)
+    {
+        NVTXW3_DLLCLOSE(hModule);
+        return result;
+    }
+
+    /* Success - now write to output params */
+    *getInterfaceFunc = tempGetInterfaceFunc;
+    if (moduleHandle)
+    {
+        *moduleHandle = (void*)hModule;
+    }
+
+    return NVTXW3_RESULT_SUCCESS;
+}
+
+/* Tries to load the default backend library (NVTXW3_LIB_FILENAME_DEFAULT)
+*  from three places, in order:
+*    1. the directory of the current process's executable,
+*    2. the standard search paths for dynamic libraries,
+*    3. the current working directory.
+*  Returns NVTXW3_RESULT_SUCCESS as soon as one location yields a usable
+*  backend (the outputs are then set as by InitLibraryFilename), or
+*  NVTXW3_RESULT_LIBRARY_NOT_FOUND if none does.  A location whose directory
+*  cannot be determined is skipped. */
+static nvtxwResultCode_t InitSearchDefault(
+    const char* configString,              /* optional */
+    nvtxwGetInterface_t* getInterfaceFunc, /* already null-checked */
+    void** moduleHandle)                   /* optional */
+{
+    nvtxwResultCode_t result;
+    char* dir;
+    char* filename;
+
+    /* 1. Directory of current process's executable */
+    dir = GetCurrentProcessDir();
+    if (dir)
+    {
+        filename = AppendToPathHeapString(dir, NVTXW3_LIB_FILENAME_DEFAULT);
+        result = InitLibraryFilename(
+            filename, configString, getInterfaceFunc, moduleHandle);
+        free(filename);
+        if (result == NVTXW3_RESULT_SUCCESS)
+        {
+            return NVTXW3_RESULT_SUCCESS;
+        }
+    }
+
+    /* 2. Standard search paths for dynamic libraries */
+    result = InitLibraryFilename(
+        NVTXW3_LIB_FILENAME_DEFAULT, configString, getInterfaceFunc, moduleHandle);
+    if (result == NVTXW3_RESULT_SUCCESS)
+    {
+        return NVTXW3_RESULT_SUCCESS;
+    }
+
+    /* 3. Current working directory (may not be included in standard search paths) */
+    dir = GetCurrentWorkingDir();
+    if (dir)
+    {
+        filename = AppendToPathHeapString(dir, NVTXW3_LIB_FILENAME_DEFAULT);
+        result = InitLibraryFilename(
+            filename, configString, getInterfaceFunc, moduleHandle);
+        free(filename);
+        /* This result must be honored like the others; otherwise a backend
+        *  that loaded successfully would be reported as not found. */
+        if (result == NVTXW3_RESULT_SUCCESS)
+        {
+            return NVTXW3_RESULT_SUCCESS;
+        }
+    }
+
+    /* No usable backend found */
+    return NVTXW3_RESULT_LIBRARY_NOT_FOUND;
+}
+
+/* Loads the default backend library (NVTXW3_LIB_FILENAME_DEFAULT) from the
+*  given directory.  Returns NVTXW3_RESULT_INVALID_ARGUMENT when directory is
+*  NULL, otherwise whatever InitLibraryFilename returns for the combined
+*  path. */
+static nvtxwResultCode_t InitLibraryDirectory(
+    const char* directory,                 /* required */
+    const char* configString,              /* optional */
+    nvtxwGetInterface_t* getInterfaceFunc, /* already null-checked */
+    void** moduleHandle)                   /* optional */
+{
+    char* libPath;
+    nvtxwResultCode_t status;
+
+    if (directory == NULL)
+    {
+        return NVTXW3_RESULT_INVALID_ARGUMENT;
+    }
+
+    /* Work on a private copy so the caller's string is never modified */
+    libPath = MakeHeapString(directory);
+    libPath = AppendToPathHeapString(libPath, NVTXW3_LIB_FILENAME_DEFAULT);
+
+    status = InitLibraryFilename(libPath, configString, getInterfaceFunc, moduleHandle);
+    free(libPath);
+    return status;
+}
+
+/* Initializes a backend as described by the text of a config string.  The
+*  loader mode and mode string are read from the config itself, and the whole
+*  config is forwarded to the selected loader helper.
+*  Returns NVTXW3_RESULT_INVALID_ARGUMENT when config is NULL,
+*  NVTXW3_RESULT_CONFIG_MISSING_LOADER_INFO when the loader settings cannot be
+*  read from it, NVTXW3_RESULT_UNSUPPORTED_LOADER_MODE when the mode is not one
+*  of the three library-loading modes, and otherwise the helper's result. */
+static nvtxwResultCode_t InitConfigString(
+    const char* config,
+    nvtxwGetInterface_t* getInterfaceFunc,
+    void** moduleHandle)
+{
+    nvtxwResultCode_t status;
+    int loaderMode = 0;
+    char* loaderArg = NULL;
+
+    if (config == NULL) return NVTXW3_RESULT_INVALID_ARGUMENT;
+
+    if (GetInitModeFromConfig(config, &loaderMode, &loaderArg) != 0)
+    {
+        free(loaderArg);
+        return NVTXW3_RESULT_CONFIG_MISSING_LOADER_INFO;
+    }
+
+    if (loaderMode == NVTXW3_INIT_MODE_SEARCH_DEFAULT)
+    {
+        status = InitSearchDefault(config, getInterfaceFunc, moduleHandle);
+    }
+    else if (loaderMode == NVTXW3_INIT_MODE_LIBRARY_FILENAME)
+    {
+        status = InitLibraryFilename(loaderArg, config, getInterfaceFunc, moduleHandle);
+    }
+    else if (loaderMode == NVTXW3_INIT_MODE_LIBRARY_DIRECTORY)
+    {
+        status = InitLibraryDirectory(loaderArg, config, getInterfaceFunc, moduleHandle);
+    }
+    else
+    {
+        status = NVTXW3_RESULT_UNSUPPORTED_LOADER_MODE;
+    }
+
+    free(loaderArg);
+    return status;
+}
+
+/* Initializes a backend from a config string held in the environment variable
+*  named configEnvVarName.  Returns NVTXW3_RESULT_INVALID_ARGUMENT when the
+*  name is NULL, NVTXW3_RESULT_ENV_VAR_NOT_FOUND when the variable is not set,
+*  and otherwise the result of InitConfigString on its value. */
+static nvtxwResultCode_t InitConfigEnvVar(
+    const char* configEnvVarName,
+    nvtxwGetInterface_t* getInterfaceFunc,
+    void** moduleHandle)
+{
+    const char* envValue;
+
+    if (configEnvVarName == NULL)
+    {
+        return NVTXW3_RESULT_INVALID_ARGUMENT;
+    }
+
+    envValue = getenv(configEnvVarName);
+    if (envValue == NULL)
+    {
+        return NVTXW3_RESULT_ENV_VAR_NOT_FOUND;
+    }
+
+    return InitConfigString(envValue, getInterfaceFunc, moduleHandle);
+}
+
+/* Initializes a backend from a config file.  The file is read completely
+*  into memory and handed to InitConfigString.  Returns
+*  NVTXW3_RESULT_INVALID_ARGUMENT when configFilename is NULL,
+*  NVTXW3_RESULT_CONFIG_NOT_FOUND when the file cannot be loaded, and
+*  otherwise the result of InitConfigString. */
+static nvtxwResultCode_t InitConfigFilename(
+    const char* configFilename,
+    nvtxwGetInterface_t* getInterfaceFunc,
+    void** moduleHandle)
+{
+    char* text;
+    nvtxwResultCode_t status;
+
+    if (configFilename == NULL)
+    {
+        return NVTXW3_RESULT_INVALID_ARGUMENT;
+    }
+
+    text = LoadFileIntoHeapString(configFilename);
+    if (text == NULL)
+    {
+        return NVTXW3_RESULT_CONFIG_NOT_FOUND;
+    }
+
+    status = InitConfigString(text, getInterfaceFunc, moduleHandle);
+    free(text);
+    return status;
+}
+
+/* Initializes a backend from the default config file
+*  (NVTXW3_CONFIG_FILENAME_DEFAULT) inside configDirectory.  Returns
+*  NVTXW3_RESULT_INVALID_ARGUMENT when the directory is NULL, otherwise the
+*  result of InitConfigFilename for the combined path. */
+static nvtxwResultCode_t InitConfigDirectory(
+    const char* configDirectory,
+    nvtxwGetInterface_t* getInterfaceFunc,
+    void** moduleHandle)
+{
+    char* configPath;
+    nvtxwResultCode_t status;
+
+    if (configDirectory == NULL)
+    {
+        return NVTXW3_RESULT_INVALID_ARGUMENT;
+    }
+
+    /* Work on a private copy so the caller's string is never modified */
+    configPath = MakeHeapString(configDirectory);
+    configPath = AppendToPathHeapString(configPath, NVTXW3_CONFIG_FILENAME_DEFAULT);
+
+    status = InitConfigFilename(configPath, getInterfaceFunc, moduleHandle);
+    free(configPath);
+    return status;
+}
+
+/* #define NVTXW3_TEST_PATH_UTILITIES */
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+#include <test_path_utilities.h>
+#endif
+
+/* Public entry point: locates and loads a backend using the strategy selected
+*  by mode, with modeString as that strategy's argument (library or config
+*  path, config text, or environment variable name; unused for the default
+*  search).  Returns NVTXW3_RESULT_INVALID_ARGUMENT when getInterfaceFunc is
+*  NULL, NVTXW3_RESULT_INVALID_INIT_MODE when mode is not one of the handled
+*  values, and otherwise the result of the selected helper. */
+NVTXW3_DECLSPEC nvtxwResultCode_t nvtxwInitialize(
+    nvtxwInitMode_t mode,
+    const char* modeString,
+    nvtxwGetInterface_t* getInterfaceFunc,
+    void** moduleHandle)
+{
+    nvtxwResultCode_t status = NVTXW3_RESULT_INVALID_INIT_MODE;
+
+#if defined(NVTXW3_TEST_PATH_UTILITIES)
+    TestPathUtilities();
+#endif
+
+    if (getInterfaceFunc == NULL)
+    {
+        return NVTXW3_RESULT_INVALID_ARGUMENT;
+    }
+
+    switch (mode)
+    {
+        case NVTXW3_INIT_MODE_SEARCH_DEFAULT:
+            status = InitSearchDefault(NULL, getInterfaceFunc, moduleHandle);
+            break;
+        case NVTXW3_INIT_MODE_LIBRARY_FILENAME:
+            status = InitLibraryFilename(modeString, NULL, getInterfaceFunc, moduleHandle);
+            break;
+        case NVTXW3_INIT_MODE_LIBRARY_DIRECTORY:
+            status = InitLibraryDirectory(modeString, NULL, getInterfaceFunc, moduleHandle);
+            break;
+        case NVTXW3_INIT_MODE_CONFIG_FILENAME:
+            status = InitConfigFilename(modeString, getInterfaceFunc, moduleHandle);
+            break;
+        case NVTXW3_INIT_MODE_CONFIG_DIRECTORY:
+            status = InitConfigDirectory(modeString, getInterfaceFunc, moduleHandle);
+            break;
+        case NVTXW3_INIT_MODE_CONFIG_STRING:
+            status = InitConfigString(modeString, getInterfaceFunc, moduleHandle);
+            break;
+        case NVTXW3_INIT_MODE_CONFIG_ENV_VAR:
+            status = InitConfigEnvVar(modeString, getInterfaceFunc, moduleHandle);
+            break;
+        default:
+            /* Unknown mode: keep NVTXW3_RESULT_INVALID_INIT_MODE */
+            break;
+    }
+
+    return status;
+}
+
+/* Public entry point: unloads a backend library given its module handle.  If
+*  the library exports "nvtxwUnloadImplementation", that function is called
+*  first; the library is then closed.  A NULL handle is ignored. */
+NVTXW3_DECLSPEC void nvtxwUnload(
+    void* moduleHandle)
+{
+    NVTXW3_DLLHANDLE lib = (NVTXW3_DLLHANDLE)moduleHandle;
+    nvtxwUnloadImplementation_t unloadImpl;
+
+    if (lib)
+    {
+        /* The unload hook is optional */
+        unloadImpl = (nvtxwUnloadImplementation_t)NVTXW3_DLLFUNC(lib, "nvtxwUnloadImplementation");
+        if (unloadImpl)
+        {
+            unloadImpl();
+        }
+
+        NVTXW3_DLLCLOSE(lib);
+    }
+}
diff --git a/src/main/cpp/profiler/nvtxw3.h b/src/main/cpp/profiler/nvtxw3.h
new file mode 100644
index 0000000000..d8dc40aa0f
--- /dev/null
+++ b/src/main/cpp/profiler/nvtxw3.h
@@ -0,0 +1,549 @@
+/*
+ *  Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  Licensed under the Apache License v2.0 with LLVM Exceptions.
+ *  See https://llvm.org/LICENSE.txt for license information.
+ *
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#if !defined(NVTXW3_API)
+#define NVTXW3_API
+
+#include <nvtx3/nvToolsExtPayload.h>
+
+#include <string.h> /* For nvtxwConsumeConfigString inline implementation */
+
+#ifdef __cplusplus
+#define NVTXW3_DECLSPEC extern "C"
+#else
+#define NVTXW3_DECLSPEC extern
+#endif
+
+typedef int32_t nvtxwResultCode_t;
+
+#define NVTXW3_RESULT_SUCCESS                     0
+#define NVTXW3_RESULT_FAILED                      1
+#define NVTXW3_RESULT_INVALID_ARGUMENT            2
+#define NVTXW3_RESULT_INVALID_INIT_MODE           3
+#define NVTXW3_RESULT_LIBRARY_NOT_FOUND           4
+#define NVTXW3_RESULT_CONFIG_NOT_FOUND            5
+#define NVTXW3_RESULT_LOADER_SYMBOL_MISSING       6
+#define NVTXW3_RESULT_LOADER_FAILED               7
+#define NVTXW3_RESULT_INTERFACE_ID_NOT_SUPPORTED  8
+#define NVTXW3_RESULT_CONFIG_MISSING_LOADER_INFO  9
+#define NVTXW3_RESULT_UNSUPPORTED_LOADER_MODE    10
+#define NVTXW3_RESULT_ENV_VAR_NOT_FOUND          11
+
+
+#if defined(_WIN32)
+#define NVTXW3_LIB_PREFIX ""
+#define NVTXW3_LIB_SUFFIX ".dll"
+#else
+#define NVTXW3_LIB_PREFIX "lib"
+#define NVTXW3_LIB_SUFFIX ".so"
+#endif
+
+/* Name of backend library file to use with init mode LIBRARY_DIRECTORY.
+*  Note the platform-dependent prefix and suffix above are added here. */
+#define NVTXW3_LIB_FILENAME_DEFAULT NVTXW3_LIB_PREFIX "nvtxw3" NVTXW3_LIB_SUFFIX
+
+/* Name of config file to use with init mode CONFIG_DIRECTORY.  Unlike the
+*  library filename above, no platform-dependent prefix or suffix is added. */
+#define NVTXW3_CONFIG_FILENAME_DEFAULT "nvtxw3.ini"
+
+/* Init modes:  nvtxwInitialize takes nvtxwInitMode_t mode, one of the #defines
+*  below, and a modeString, whose meaning is dependent on the mode.  These modes
+*  provide a variety of ways to find the NVTXW backend implementation library. */
+typedef int32_t nvtxwInitMode_t;
+
+/* Default search mode is to look for library with default filename, as defined
+*  by NVTXW3_LIB_FILENAME_DEFAULT, in the following order:
+*    1. Directory of current process's executable
+*    2. Standard search paths for dynamic libraries
+*    3. Current working directory (may not be included in standard search paths)
+*  The modeString argument is ignored. */
+#define NVTXW3_INIT_MODE_SEARCH_DEFAULT           0
+
+/* The modeString argument is interpreted as a filename or pathname to the
+*  backend library.  The string is passed directly to the platform function
+*  for loading dynamic libraries (dlopen/LoadLibrary), so that function's
+*  behavior will apply.  In general, a filename with no path will try the
+*  standard search paths, and an absolute path will be used verbatim. */
+#define NVTXW3_INIT_MODE_LIBRARY_FILENAME         1
+
+/* The modeString argument is interpreted as a directory in which to search
+*  for the backend library, whose filename is defined by the macro
+*  NVTXW3_LIB_FILENAME_DEFAULT. */
+#define NVTXW3_INIT_MODE_LIBRARY_DIRECTORY        2
+
+/* The modeString argument is interpreted as a filename or pathname to a
+*  config file, which will be used to find the backend library.  If the
+*  filename is not an absolute path, it will be interpreted as relative
+*  to the current working directory.  See below for config file format. */
+#define NVTXW3_INIT_MODE_CONFIG_FILENAME          3
+
+/* The modeString argument is interpreted as a directory in which to search
+*  for a config file, which will be used to find the backend library.  The
+*  name of the config file is defined by NVTXW3_CONFIG_FILENAME_DEFAULT.
+   See below for config file format. */
+#define NVTXW3_INIT_MODE_CONFIG_DIRECTORY         4
+
+/* The modeString argument is interpreted as the config string itself.
+*  See below for config string format. */
+#define NVTXW3_INIT_MODE_CONFIG_STRING            5
+
+/* The modeString argument is interpreted as the name of an environment
+*  variable that contains the config string.  See below for config string
+*  format. */
+#define NVTXW3_INIT_MODE_CONFIG_ENV_VAR           6
+
+/* Config format (for both files and flat config strings):
+*
+*  The format is key=value pairs, delimited by new-line characters or
+*  | (pipe) characters.  Values are prohibited from containing those
+*  characters.  If an entry begins with #, the entry (up to the next
+*  new-line or pipe) is discarded as a comment.
+*
+*  When the config string is provided to the SessionBegin function
+*  as an argument, it is preprocessed to remove comments, blank lines,
+*  and to convert all entry delimiters to a single \n (line feed).
+*  This allows the tool to have a simpler config parser, and to print
+*  the config in a readable format.
+*
+*  If a config specifies the same key multiple times, only the first
+*  appearance should be honored, and the subsequent appearances should
+*  be ignored.  This allows a simple scan for a particular key to loop
+*  from the beginning until the first occurrence is found, and not have
+*  to loop through the rest for repeats.  Note that this means building
+*  a map from keys to values should not overwrite existing values if a
+*  found key already exists in the map.  This guarantee allows adding
+*  extra key/value pairs to a config string by prepending (to override
+*  existing keys) or appending (to set values only if they weren't set
+*  already).
+*
+*  Keys are tool-specific, but the loader supports two keys:
+*
+*  - InitMode=n
+*      Just like the argument to nvtxwInitialize, this allows the user
+*      to specify how to find the backend library, using one of the
+*      numeric values of the NVTXW3_INIT_MODE_ constants.  Currently,
+*      only values 0-2 are supported for init modes specified within
+*      a config file/string.
+*
+*  - InitModeString=string
+*      Just like the argument to nvtxwInitialize, this allows the user
+*      to specify a mode-specific string for how to find the backend
+*      library.  This key is ignored for mode 0 (SEARCH_DEFAULT), but
+*      required for other modes.  Currently, only mode values 0-2 are
+*      supported for init modes specified within a config file/string.
+*/
+
+/*--------- Helpers for consuming config strings ----------------*/
+
+/* Typedef of function pointer for callback to use with nvtxwConsumeConfigString.
+*  The state pointer can be used for anything -- nvtxwConsumeConfigString passes
+*  it directly to the callback.  The begin/end pointers for the key and value are
+*  pointing to ranges within the input config string.  If the input config string
+*  is known to be non-const, this callback can safely cast away const and write
+*  to these pointers, for example when simplifying an input config string.  To
+*  check if a key is exactly "Name", also compare lengths (strncmp alone accepts any prefix):
+*     (size_t)(keyEnd - keyBegin) == strlen("Name") && strncmp("Name", keyBegin, keyEnd - keyBegin) == 0
+*  In C++, you can construct a string using std::string(keyBegin, keyEnd).
+*  Return zero to continue consuming key/value pairs, or non-zero to stop. */
+typedef int (*nvtxwKeyValuePairConsumer_t)(
+    void* state,
+    const char* keyBegin,
+    const char* keyEnd,
+    const char* valBegin,
+    const char* valEnd);
+
+/* Parse config and call the consumer callback (see typedef above) on each
+*  valid key/value pair found, stopping once it returns non-zero; a null
+*  config or consumer is a no-op.  Inline implementation provided here so
+*  NVTXW backends can use this function without nvtxw3.c in their build.
+*  API users may also use it to parse/modify a config before passing it on. */
+NVTX_LINKONCE_DEFINE_FUNCTION
+void nvtxwConsumeConfigString(const char* config, nvtxwKeyValuePairConsumer_t consumer, void* state)
+{
+    const char* curRead = config;
+    const char* const lineBreak = "|\n\r"; /* Entry delimiters: pipe, line feed, carriage return */
+    const char* const whitespace = " \t\v"; /* Not including lineBreak characters */
+    int consumerStopRequested = 0;
+
+    if (!config || !consumer) return;
+
+    while (*curRead && !consumerStopRequested)
+    {
+        const char* lineBegin;
+        const char* lineEnd;
+        const char* keyBegin;
+        const char* keyEnd;
+        const char* valBegin;
+        const char* valEnd;
+
+        /* Read a line, trimming leading whitespace - get pointers to begin/end */
+        lineBegin = curRead + strspn(curRead, whitespace);
+        lineEnd = lineBegin + strcspn(lineBegin, lineBreak);
+
+        /* Set read pointer to beginning of next line, so we can continue any time */
+        curRead = lineEnd + strspn(lineEnd, lineBreak);
+
+        /* Ignore line if it's only whitespace */
+        if (lineBegin == lineEnd) continue;
+        /* Ignore line if it is a comment */
+        if (*lineBegin == '#') continue;
+
+        /* Determine if line has a key and value delimited by '=' */
+        keyBegin = lineBegin;
+        keyEnd = keyBegin;
+        while (keyEnd < lineEnd && *keyEnd != '=') ++keyEnd;
+
+        /* Ignore line if there's no '=' in the line */
+        if (keyEnd == lineEnd) continue;
+        /* Ignore line if there's no key name before '=' */
+        if (keyEnd == keyBegin) continue;
+
+        /* keyEnd now points at '=' after the key */
+        valBegin = keyEnd + 1;
+        valBegin += strspn(valBegin, whitespace);
+
+        /* Ignore line if all characters after '=' are whitespace */
+        if (valBegin == lineEnd) continue;
+
+        valEnd = lineEnd;
+
+        /* Got begin/end pointers for key and value.  We know there are non-whitespace
+        *  characters in both of them, and their leading whitespace was already trimmed.
+        *  Now trim trailing whitespace; each loop stops at that non-whitespace first char. */
+        while (strchr(whitespace, *(keyEnd - 1))) --keyEnd;
+        while (strchr(whitespace, *(valEnd - 1))) --valEnd;
+
+        /* Now key and value begin/end pointers can be passed to the consumer */
+        consumerStopRequested = consumer(state, keyBegin, keyEnd, valBegin, valEnd);
+    }
+}
+
+/*--------- Initialization interface ---------*/
+
+typedef int32_t nvtxwInterfaceId_t;
+
+typedef nvtxwResultCode_t (*nvtxwGetInterface_t)(
+    nvtxwInterfaceId_t interfaceId, /* Interface ID, e.g. NVTXW3_INTERFACE_ID_CORE_V1 */
+    const void** iface);            /* Presumably receives the matching interface struct -- confirm */
+
+/* Initialize the NVTXW library by providing information on how to
+*  load the backend library that implements the NVTXW API.  `mode` must
+*  be one of the NVTXW3_INIT_MODE_ constants.  `modeString` is required
+*  for all modes besides 0 (SEARCH_DEFAULT), and has mode-specific
+*  interpretation.  See comments for the mode constants.  Backend library
+*  must provide an exported function symbol "nvtxwLoadImplementation",
+*  which must return NVTXW3_RESULT_SUCCESS and provide a pointer to its
+*  GetInterface function for initialization to be considered successful.
+*  Modes that search multiple locations will continue searching after an
+*  unsuccessful attempt to initialize a library.
+*  `getInterfaceFunc` is an out-param that must be non-null to receive
+*  a pointer to the backend's GetInterface function, which is used to
+*  make version-safe calls into the backend library.
+*  `moduleHandle` is an out-param that can be null.  If non-null, it
+*  receives the platform-specific module handle of the loaded backend
+*  library when NVTXW3_RESULT_SUCCESS is returned.  This can be passed
+*  to nvtxwUnload to unload the backend library. */
+NVTXW3_DECLSPEC nvtxwResultCode_t nvtxwInitialize(
+    nvtxwInitMode_t mode,                  /* Unrecognized value returns NVTXW3_RESULT_INVALID_INIT_MODE */
+    const char* modeString,
+    nvtxwGetInterface_t* getInterfaceFunc, /* Null returns NVTXW3_RESULT_INVALID_ARGUMENT */
+    void** moduleHandle);
+
+/* A backend library may optionally provide an exported function symbol
+*  "nvtxwUnloadImplementation".  If it does, nvtxwUnload will call this
+*  function before closing the module handle.  This gives the backend a
+*  chance to free any memory tracked in global variables before it gets
+*  unloaded.  Attempting to unload the backend is not necessary and not
+*  even recommended in common cases -- it is included to ensure clients
+*  of the NVTXW API have a way to cleanly pass a memory checker. */
+NVTXW3_DECLSPEC void nvtxwUnload(
+    void* moduleHandle); /* Handle received from nvtxwInitialize; a null handle is a no-op */
+
+/*----- Typedefs for function pointers backend implements -----*/
+
+typedef nvtxwResultCode_t (*nvtxwLoadImplementation_t)(
+    const char* configString,              /* NOTE(review): presumably the loader's config string, may be null -- confirm */
+    nvtxwGetInterface_t* getInterfaceFunc); /* Receives a pointer to the backend's GetInterface function */
+
+typedef void (*nvtxwUnloadImplementation_t)(void); /* (void): in C, () would leave the parameters unspecified */
+
+/*--------- Interface IDs ----------------*/
+
+#define NVTXW3_INTERFACE_ID_CORE_V1      2
+
+/*--------- INTERFACE_ID_CORE_V1 ---------*/
+
+typedef struct nvtxwSessionHandle_t
+{
+    void* opaque; /* NOTE(review): presumably a backend-defined value set by SessionBegin -- confirm */
+} nvtxwSessionHandle_t;
+
+typedef struct nvtxwStreamHandle_t
+{
+    void* opaque; /* NOTE(review): presumably a backend-defined value set by StreamOpen -- confirm */
+} nvtxwStreamHandle_t;
+
+/* Growable struct of arguments for SessionBegin */
+typedef struct nvtxwSessionAttributes_v1
+{
+    /* Guaranteed to increase when new members are added at the end */
+    size_t struct_size;
+
+    /* Provide a name for the session.
+    *  Tools may display this name, or use it to name a file or directory
+    *  representing the session. */
+    const char* name;
+
+    /* String containing configuration options for the session.
+    *  Format is key=value, one per line, delimited by \n (line feed).
+    *  Key names must not contain an = (equals sign), and values may
+    *  contain any character except \r (carriage return), \n (line feed),
+    *  or | (pipe).  Tools shall use reasonable defaults for any config
+    *  options not provided, and ignore any keys they do not support.
+    *  See "Config format" above for how the string is preprocessed first.
+    *  See tool-specific documentation for lists of supported keys. */
+    const char* configString;
+} nvtxwSessionAttributes_t;
+
+/* Define whether event ordering in a stream is based on event scope */
+
+/* Event ordering is defined at the stream level, independent of
+*  event scopes within the stream. */
+#define NVTXW3_STREAM_ORDER_INTERLEAVING_NONE          (int16_t)0
+
+/* Event ordering is defined at the event scope level.  This means
+*  ordering guarantees described by the other fields only apply to
+*  events of the same scope within the stream.  The order of events
+*  in different scopes is unspecified. */
+#define NVTXW3_STREAM_ORDER_INTERLEAVING_EVENT_SCOPE   (int16_t)1
+
+
+/* Define how events are fully or partially sorted in a stream. */
+
+/* No guarantees can be made about event ordering in the stream.
+*  Events may need to be sorted by the tool. */
+#define NVTXW3_STREAM_ORDERING_TYPE_UNKNOWN            (int16_t)0
+
+/* All events represent single points in time and are fully or
+*  partially sorted in the order in which they occurred. */
+#define NVTXW3_STREAM_ORDERING_TYPE_STRICT             (int16_t)1
+
+/* Events that represent single points in time are fully or
+*  partially sorted in the order in which they occurred, and
+*  events representing time ranges in order of begin time. */
+#define NVTXW3_STREAM_ORDERING_TYPE_PACKED_RANGE_START (int16_t)2
+
+/* Events that represent single points in time are fully or
+*  partially sorted in the order in which they occurred, and
+*  events representing time ranges in order of end time. */
+#define NVTXW3_STREAM_ORDERING_TYPE_PACKED_RANGE_END   (int16_t)3
+
+/* Define how to quantify skid when events are partially sorted.  Only considered
+*  when orderingType is not UNKNOWN.  Which events in the stream this applies to
+*  depends on the value of orderInterleaving.  Which timestamp is used for ordering
+*  in an event with multiple timestamps depends on the value of orderingType. */
+
+/* Events are fully sorted. */
+#define NVTXW3_STREAM_ORDERING_SKID_NONE          0
+
+/* Events are partially sorted.  The orderingSkidAmount field defines "skid" as
+*  a number of nanoseconds.  For any two events A and B in the stream or scope
+*  (depending on interleaving level), where A is written into the stream before
+*  B, the tool must handle the case where B has a lower timestamp than A, but
+*  can assume B's timestamp cannot be more than the "skid" number of nanoseconds
+*  earlier than A's timestamp.  Note that timestamp values in events cannot be
+*  assumed to be in units of nanoseconds, so this value cannot be added directly
+*  to timestamp values without conversion. */
+#define NVTXW3_STREAM_ORDERING_SKID_TIME_NS       1
+
+/* Events are partially sorted.  The orderingSkidAmount field defines "skid" as
+*  a number of events.  Regarding only events in a stream or scope (depending on
+*  interleaving level), for any event A, the next "skid" number of events after
+*  A may have a lower timestamp than A (by any amount of time), but no events
+*  written after that can have a lower timestamp than A. */
+
+/* Put another way: no event in the stream or scope is written more than the
+*  given number of events after a previously written event that has a higher
+*  timestamp.  Note that timestamps in events may not be in units of
+*  nanoseconds; the skid amount here counts events, not time. */
+#define NVTXW3_STREAM_ORDERING_SKID_EVENT_COUNT   2
+
+/* Growable struct of arguments for StreamOpen */
+typedef struct nvtxwStreamAttributes_v1
+{
+    /* Guaranteed to increase when new members are added at the end */
+    size_t struct_size;
+
+    /* Name of a stream, used for identification from other streams.
+    *  Tools typically will not display stream names.  No two streams
+    *  in the same session may have the same name. */
+    const char* name;
+
+    /* Name of NVTX domain to use implicitly for all events written into
+    *  this stream.  Since registered IDs are required to be unique within
+    *  a domain, all ID registration functions called on this stream must
+    *  not register the same ID value to mean different things.  Multiple
+    *  streams may use the same domain by specifying the same value for
+    *  this string, and the tool is expected to combine registrations from
+    *  these streams into a single set of registrations for the domain.
+    *  If two streams share a domain, and a registration is made in one
+    *  stream, the registered ID may be used immediately afterwards in the
+    *  other stream, provided the usage occurs on the same thread -- it is
+    *  implementation-defined whether or not this is supported if the usage
+    *  occurs on a different thread.  Tools are expected to combine data
+    *  from any domains registered with the same name, even between NVTXW
+    *  and NVTX, when merging data acquired from both APIs. */
+    const char* nvtxDomainName;
+
+    /* The default scope for all events in the stream that don't specify
+    *  any scope.  See comments below for nvtxwEventScopeAttributes_t.
+    *  Note that "nvtxwStream" without brackets may not be used as a node
+    *  name here -- this field is defining what that node name will mean
+    *  in scope registrations occurring later in this stream.  However,
+    *  "nvtxwStream[name]" referencing a different stream by its name
+    *  (see above) to use its default scope is supported, as long as that
+    *  stream was successfully opened (and may be already closed). */
+    const char* eventScopePath;
+
+    /* Information about event ordering inside the stream.  See comments
+    *  for the NVTXW3_STREAM_ORDER* #defines above. */
+    int16_t orderInterleaving;  /* NVTXW3_STREAM_ORDER_INTERLEAVING_*    */
+    int16_t orderingType;       /* NVTXW3_STREAM_ORDERING_TYPE_*         */
+    int32_t orderingSkid;       /* NVTXW3_STREAM_ORDERING_SKID_*         */
+    int64_t orderingSkidAmount; /* Nanoseconds or event count, per orderingSkid */
+} nvtxwStreamAttributes_t;
+
+/* Growable struct of arguments for EventScopeRegister */
+typedef struct nvtxwEventScopeAttributes_v1
+{
+    /* Guaranteed to increase when new members are added at the end */
+    size_t struct_size;
+
+    /* Path delimited by / characters, relative to hierarchy root.
+    *  Nodes in the path may use name[key] syntax to indicate an
+    *  array of sibling nodes, which may be combined with other
+    *  non-array nodes or different arrays at the same scope.
+    *  Leading slashes are ignored.  Node names should be ASCII
+    *  printable characters, excluding the /, [, and ] characters,
+    *  which have special meaning here.  A set of reserved node
+    *  names with special properties is given in the documentation
+    *  for NVTX Deferred Events.  "nvtxwStream" is a reserved node
+    *  name that can be used as a path's root node, indicating the
+    *  path is relative to the eventScopePath set for the stream
+    *  in which the event scope is registered.  "nvtxwStream[name]"
+    *  refers to the eventScopePath of a stream in the session with
+    *  matching name.  Note that the NVTX domain is implicitly a
+    *  child node of the scope, since multiple domains can assign
+    *  events to the same scope, and tools should isolate events
+    *  from separate domains. */
+    const char* path;
+
+    /* Static event scope ID must be provided, unique within the domain,
+    *  >= NVTX_EVENT_SCOPE_ID_STATIC_START, and
+    *  <  NVTX_EVENT_SCOPE_ID_DYNAMIC_START */
+    uint64_t scopeId;
+} nvtxwEventScopeAttributes_t;
+
+/* nvtxwInterfaceCore_t is a growable struct of function pointers to
+*  the NVTX Writer (NVTXW) API.  Breaking changes will not be made to
+*  this interface without also changing the interface ID passed to
+*  nvtxwGetInterface_t, e.g. NVTXW3_INTERFACE_ID_CORE_V1.  Non-breaking
+*  changes are made by adding fields to the end of the struct, ensuring the
+*  value of 'struct_size' increases, so the presence of a member can
+*  be checked by comparing struct_size with that member's offset. */
+typedef struct nvtxwInterfaceCore_v1
+{
+    /* Guaranteed to increase when new members are added at the end */
+    size_t struct_size;
+
+    /* Create a session, which represents a collection of trace data
+    *  from one or more streams.  Takes a growable struct of session
+    *  attributes (see nvtxwSessionAttributes_t). */
+    nvtxwResultCode_t (*SessionBegin)(
+        nvtxwSessionHandle_t* session,
+        const nvtxwSessionAttributes_t* attr);
+
+    /* Notify the implementation that all trace data for the session
+    *  has been provided, and the session may be destroyed.  Depending
+    *  on configuration options, ending a session may trigger behavior
+    *  like writing an output file or opening a data viewer. */
+    nvtxwResultCode_t (*SessionEnd)(
+        nvtxwSessionHandle_t session);
+
+    /* Create a stream within a session.  A stream is the object events
+    *  are written to.  The NVTX domain and event scope are set when
+    *  creating a stream, allowing individual events to avoid repeating
+    *  these fields.  Since ID values for schemas, registered strings,
+    *  etc. are only unique within a domain, all registrations that
+    *  assign an ID are done within a stream, since the domain is fixed
+    *  inside a stream.  Other stream properties set at creation time
+    *  are a name string, and information about the way events in the
+    *  stream are ordered. */
+    nvtxwResultCode_t (*StreamOpen)(
+        nvtxwStreamHandle_t* stream,
+        nvtxwSessionHandle_t session,
+        const nvtxwStreamAttributes_t* attr);
+
+    /* Destroy the stream object.  This is not expected to trigger a
+    *  reaction in the implementation that no more events are coming;
+    *  only ending a session is intended to have that effect. */
+    nvtxwResultCode_t (*StreamClose)(
+        nvtxwStreamHandle_t stream);
+
+    /* Register a scope ID to represent a scope path, so the ID can be
+    *  used in events or schemas to efficiently indicate a scope.
+    *  Static event scope ID must be provided, unique within the domain,
+    *  >= NVTX_EVENT_SCOPE_ID_STATIC_START, and
+    *  <  NVTX_EVENT_SCOPE_ID_DYNAMIC_START */
+    nvtxwResultCode_t (*EventScopeRegister)(
+        nvtxwStreamHandle_t stream,
+        const nvtxwEventScopeAttributes_t* attr);
+
+    /* Register a schema ID to represent a schema, which describes the
+    *  binary layout of a payload.
+    *  Static schema ID must be provided, unique within the domain,
+    *  >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START, and
+    *  <  NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */
+    nvtxwResultCode_t (*SchemaRegister)(
+        nvtxwStreamHandle_t stream,
+        const nvtxPayloadSchemaAttr_t* attr);
+
+    /* Register a schema ID to represent an enum type, including the
+    *  mapping between its values and their name strings.
+    *  Static schema ID must be provided, unique within the domain,
+    *  >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START, and
+    *  <  NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */
+    nvtxwResultCode_t (*EnumRegister)(
+        nvtxwStreamHandle_t stream,
+        const nvtxPayloadEnumAttr_t* attr);
+
+    /* Write a batch of payloads into the stream representing one or more
+    *  events.  A logical event with multiple payloads cannot be broken up
+    *  across multiple calls to EventWrite.  The schema definitions for
+    *  the payloads dictate how they are interpreted as events. */
+    nvtxwResultCode_t (*EventWrite)(
+        nvtxwStreamHandle_t stream,
+        const nvtxPayloadData_t* payloads,
+        size_t payloadCount);
+
+} nvtxwInterfaceCore_t;
+
+#endif
diff --git a/src/main/cpp/profiler/nvtxw_events.h b/src/main/cpp/profiler/nvtxw_events.h
new file mode 100644
index 0000000000..3b46c1c989
--- /dev/null
+++ b/src/main/cpp/profiler/nvtxw_events.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "nvtxw3.h"
+#include "NvtxwEvents.h"
+
+#include <fstream>  // std::ifstream appears in initialize_nvtxw's signature below
+#include <string>
+
+// NOTE(review): the definitions are not visible from this header, so the
+// descriptions below are inferred from names and types -- confirm them.
+
+// Presumably opens `stream` within `session` using the given stream name and
+// NVTX domain name; the bool result is assumed to report success.
+extern bool createNvtxwStream(const nvtxwInterfaceCore_t *nvtxwInterface,
+  const nvtxwSessionHandle_t& session,
+  const std::string & name,
+  const std::string & domain,
+  nvtxwStreamHandle_t & stream);
+
+// Presumably loads the NVTXW backend and fills in the module handle, interface,
+// session and stream out-parameters; the meaning of the int result is not
+// visible here (TODO confirm which value means success).
+extern int initialize_nvtxw(std::ifstream& in, const std::string& outName,
+  void *& nvtxwModuleHandle,
+  nvtxwInterfaceCore_t *&nvtxwInterface,
+  nvtxwSessionHandle_t &session,
+  nvtxwStreamHandle_t &stream);
diff --git a/src/main/cpp/profiler/spark_rapids_profile_converter.cpp b/src/main/cpp/profiler/spark_rapids_profile_converter.cpp
index b916020392..77f5a3b4aa 100644
--- a/src/main/cpp/profiler/spark_rapids_profile_converter.cpp
+++ b/src/main/cpp/profiler/spark_rapids_profile_converter.cpp
@@ -50,11 +50,12 @@ extern char const* Profiler_Schema;
 
 struct program_options {
   std::optional<std::filesystem::path> output_path;
-  bool help       = false;
-  bool json       = false;
-  bool nvtxt      = false;
-  int json_indent = 2;
-  bool version    = false;
+  bool help = false;
+  bool json = false;
+  bool nvtxt = false;
+  bool nvtxw = false;  // set by -w/--nvtxw; parse_options rejects it alongside json or nvtxt
+  int  json_indent = 2;
+  bool version = false;
 };
 
 struct event {
@@ -114,6 +115,7 @@ Converts the spark-rapids profile in profile.bin into other forms.
   -i, --json-indent=INDENT  indentation to use for JSON. 0 is no indent, less than 0 also removes newlines
   -o, --output=PATH         use PATH as the output filename
   -t. --nvtxt               convert to NVTXT, default output is stdout
+  -w. --nvtxw               generate nsys-rep using NVTXW API
   -V, --version             print the version number
   )" << std::endl;
 }
@@ -179,13 +181,32 @@ std::pair<program_options, std::vector<std::string_view>> parse_options(
         ++argp;
       }
     } else if (*argp == "-j" || *argp == "--json") {
-      if (opts.nvtxt) { throw std::runtime_error("JSON and NVTXT output are mutually exclusive"); }
+      if (opts.nvtxt) {
+        throw std::runtime_error("JSON and NVTXT output are mutually exclusive");
+      }
+      if (opts.nvtxw) {
+        throw std::runtime_error("JSON and NVTXW output are mutually exclusive");
+      }
       opts.json = true;
       ++argp;
     } else if (*argp == "-t" || *argp == "--nvtxt") {
-      if (opts.json) { throw std::runtime_error("JSON and NVTXT output are mutually exclusive"); }
+      if (opts.json) {
+        throw std::runtime_error("JSON and NVTXT output are mutually exclusive");
+      }
+      if (opts.nvtxw) {
+        throw std::runtime_error("NVTXT and NVTXW output are mutually exclusive");
+      }
       opts.nvtxt = true;
       ++argp;
+    } else if (*argp == "-w" || *argp == "--nvtxw") {
+      if (opts.json) {
+        throw std::runtime_error("JSON and NVTXW output are mutually exclusive");
+      }
+      if (opts.nvtxt) {
+        throw std::runtime_error("NVTXT and NVTXW output are mutually exclusive");
+      }
+      opts.nvtxw = true;
+      ++argp;
     } else if (*argp == "-V" || *argp == "--version") {
       opts.version = true;
       ++argp;
@@ -687,12 +708,385 @@ void convert_to_nvtxt(std::ifstream& in, std::ostream& out, program_options cons
   }
 }
 
+#include "nvtxw_events.h"
+
+/**
+ * Streams every ActivityRecords flatbuffer read from `in` to NVTXW as payload events.
+ *
+ * Events go to `stream`, except NVTX ranges that carry a domain: each distinct domain
+ * gets its own stream via createNvtxwStream (falling back to `stream` if that fails).
+ * All streams, including `stream`, are closed before returning; the session is left open.
+ * Write and close failures are logged to stderr without aborting. `opts` is unused.
+ *
+ * @throws std::runtime_error on duplicate marker data or duplicate marker start IDs.
+ */
+void convert_to_nvtxw(std::ifstream& in, nvtxwInterfaceCore_t *&nvtxwInterface,
+  nvtxwSessionHandle_t& session,
+  nvtxwStreamHandle_t& stream,
+  program_options const& opts)
+{
+  nvtxwResultCode_t result = NVTXW3_RESULT_SUCCESS;
+  // NOTE(review): errorCode collects failure bits (1 = stream creation, 4 = event write,
+  // 8 = stream close) but is never returned or read; consider propagating it to main.
+  int errorCode = 0;
+  struct marker_start {
+    uint64_t timestamp;
+    uint32_t process_id;
+    uint32_t thread_id;
+    uint32_t color;
+    uint32_t category;
+    std::string name;
+    std::string domain;
+  };
+  std::unordered_map<int, spark_rapids_jni::profiler::MarkerData const*> marker_data_map;
+  std::unordered_map<int, marker_start> marker_start_map;
+  std::unordered_map<std::string, nvtxwStreamHandle_t> domainToStreamMap;
+  size_t num_dropped_records = 0;
+  // process ID taken from the first API record; stamped on kernel/memcpy/memset events
+  uint32_t api_process_id = 0;
+  // flatbuffers string accessors return nullptr for absent fields; map those to ""
+  auto fb_cstr = [](auto const* s) { return s != nullptr ? s->c_str() : ""; };
+  while (!in.eof()) {
+    auto fb_ptr = read_flatbuffer(in);
+    auto records = validate_fb<spark_rapids_jni::profiler::ActivityRecords>(*fb_ptr, "ActivityRecords");
+    auto dropped = records->dropped();
+    if (dropped != nullptr) {
+      for (int i = 0; i < dropped->size(); ++i) {
+        auto d = dropped->Get(i);
+        num_dropped_records += d->num_dropped();
+      }
+    }
+    auto api = records->api();
+    if (api != nullptr) {
+      NvidiaNvtxw::cuptiApiEvent event;
+      for (int i = 0; i < api->size(); ++i) {
+        auto a = api->Get(i);
+        event.time_start = a->start();
+        event.time_stop = a->end();
+        event.kind = a->kind() + 1;  // NOTE(review): +1 presumably aligns with the NVTXW schema's kind values - confirm
+        event.cbid = a->cbid();
+        event.process_id = a->process_id();
+        if (api_process_id == 0) {
+          api_process_id = a->process_id() & 0xffffff;
+        }
+        event.thread_id = a->thread_id() & 0xffffff;
+        event.correlation_id = a->correlation_id();
+        event.return_value = a->return_value();
+        nvtxPayloadData_t payloadData[] = {
+          {NvidiaNvtxw::PayloadSchemaId::cuptiApiId, sizeof(event), &event},
+        };
+        result = nvtxwInterface->EventWrite(stream, payloadData, std::extent<decltype(payloadData)>::value);
+        if (result != NVTXW3_RESULT_SUCCESS) {
+          fprintf(stderr, "API EventWrite failed with code %d\n", static_cast<int>(result));
+          errorCode |= 4;
+        }
+      }
+    }
+    auto device = records->device();
+    if (device != nullptr) {
+      NvidiaNvtxw::cuptiDevice event;
+      for (int i = 0; i < device->size(); ++i) {
+        auto d = device->Get(i);
+        event.global_memory_bandwidth = d->global_memory_bandwidth();
+        event.global_memory_size = d->global_memory_size();
+        event.constant_memory_size = d->constant_memory_size();
+        event.l2_cache_size = d->l2_cache_size();
+        event.num_threads_per_warp = d->num_threads_per_warp();
+        event.core_clock_rate = d->core_clock_rate();
+        event.num_memcpy_engines = d->num_memcpy_engines();
+        event.num_multiprocessors = d->num_multiprocessors();
+        event.max_ipc = d->max_ipc();
+        event.max_warps_per_multiprocessor = d->max_warps_per_multiprocessor();
+        event.max_blocks_per_multiprocessor = d->max_blocks_per_multiprocessor();
+        event.max_shared_memory_per_multiprocessor = d->max_shared_memory_per_multiprocessor();
+        event.max_registers_per_multiprocessor = d->max_registers_per_multiprocessor();
+        event.max_registers_per_block = d->max_registers_per_block();
+        event.max_shared_memory_per_block = d->max_shared_memory_per_block();
+        event.max_threads_per_block = d->max_threads_per_block();
+        event.max_block_dim_x = d->max_block_dim_x();
+        event.max_block_dim_y = d->max_block_dim_y();
+        event.max_block_dim_z = d->max_block_dim_z();
+        event.max_grid_dim_x = d->max_grid_dim_x();
+        event.max_grid_dim_y = d->max_grid_dim_y();
+        event.max_grid_dim_z = d->max_grid_dim_z();
+        event.compute_capability_major = d->compute_capability_major();
+        event.compute_capability_minor = d->compute_capability_minor();
+        event.id = d->id();
+        event.ecc_enabled = d->ecc_enabled();
+        event.name = fb_cstr(d->name());
+        nvtxPayloadData_t payloadData[] = {
+          {NvidiaNvtxw::PayloadSchemaId::nameId, strlen(event.name)+1, event.name},
+          {NvidiaNvtxw::PayloadSchemaId::cuptiDeviceId, sizeof(event), &event},
+        };
+        result = nvtxwInterface->EventWrite(stream, payloadData, std::extent<decltype(payloadData)>::value);
+        if (result != NVTXW3_RESULT_SUCCESS) {
+          fprintf(stderr, "Cupti Device EventWrite failed with code %d\n", static_cast<int>(result));
+          errorCode |= 4;
+        }
+      }
+    }
+    auto marker_data = records->marker_data();
+    if (marker_data != nullptr) {
+      for (int i = 0; i < marker_data->size(); ++i) {
+        auto m = marker_data->Get(i);
+        auto [it, inserted] = marker_data_map.insert({m->id(), m});
+        if (not inserted) {
+          std::ostringstream oss;
+          oss << "duplicate marker data for " << m->id();
+          throw std::runtime_error(oss.str());
+        }
+      }
+    }
+    auto marker = records->marker();
+    if (marker != nullptr) {
+      nvtxwStreamHandle_t nvtxStream;
+      for (int i = 0; i < marker->size(); ++i) {
+        auto m = marker->Get(i);
+        auto object_id = m->object_id();
+        if (object_id != nullptr) {
+          uint32_t process_id = object_id->process_id();
+          uint32_t thread_id = object_id->thread_id();
+          if (m->flags() & spark_rapids_jni::profiler::MarkerFlags_Start) {
+            auto it = marker_data_map.find(m->id());
+            uint32_t color = 0x444444;
+            uint32_t category = 0;
+            if (it != marker_data_map.end()) {
+              color = it->second->color();
+              category = it->second->category();
+            }
+            marker_start ms{m->timestamp(), process_id, thread_id, color, category, fb_cstr(m->name()), fb_cstr(m->domain())};
+            auto [ignored, inserted] = marker_start_map.insert({m->id(), ms});
+            if (not inserted) {
+              std::ostringstream oss;
+              oss << "duplicate marker start for ID " << m->id();
+              throw std::runtime_error(oss.str());
+            }
+          } else if (m->flags() & spark_rapids_jni::profiler::MarkerFlags_End) {
+            auto it = marker_start_map.find(m->id());
+            if (it != marker_start_map.end()) {
+              auto const& ms = it->second;
+              // use the default stream unless the NVTX range has a (non-empty) domain
+              nvtxStream = stream;
+              std::string domainStr(ms.domain);
+              if (!domainStr.empty()) {
+                auto domainStreamIt = domainToStreamMap.find(domainStr);
+                if (domainStreamIt != domainToStreamMap.end()) {
+                  // reuse existing stream for this domain
+                  nvtxStream = domainStreamIt->second;
+                } else {
+                  // create a new stream for this domain; on failure keep using the default stream
+                  bool valid = createNvtxwStream(nvtxwInterface, session, domainStr, domainStr, nvtxStream);
+                  if (valid) {
+                    domainToStreamMap[domainStr] = nvtxStream;
+                  } else {
+                    fprintf(stderr, "createNvtxwStream failed for domain %s\n", domainStr.c_str());
+                    nvtxStream = stream;
+                    errorCode |= 1;
+                  }
+                }
+              }
+              NvidiaNvtxw::nvtxRangeEvent event;
+              event.time_start = ms.timestamp;
+              event.time_stop = m->timestamp();
+              event.name = ms.name.c_str();
+              event.process_id = ms.process_id & 0xffffff;
+              event.thread_id = ms.thread_id & 0xffffff;
+              event.color = ms.color;
+              nvtxPayloadData_t payloadData[] = {
+                {NvidiaNvtxw::PayloadSchemaId::nameId, strlen(event.name)+1, event.name},
+                {NvidiaNvtxw::PayloadSchemaId::nvtxRangePushPopId, sizeof(event), &event},
+              };
+              result = nvtxwInterface->EventWrite(nvtxStream, payloadData, std::extent<decltype(payloadData)>::value);
+              if (result != NVTXW3_RESULT_SUCCESS) {
+                fprintf(stderr, "NvtxRange EventWrite failed with code %d\n", static_cast<int>(result));
+                errorCode |= 4;
+              }
+              marker_start_map.erase(it);
+            } else {
+              std::cerr << "Ignoring marker end without start for ID " << m->id() << std::endl;
+            }
+          } else {
+            std::cerr << "Ignoring marker with unsupported flags: " << m->flags() << std::endl;
+          }
+        } else {
+          std::cerr << "Marker " << m->id() << " has no object ID" << std::endl;
+        }
+      }
+    }
+    // marker data entries point into this iteration's flatbuffer, so do not keep them around
+    marker_data_map.clear();
+    auto kernel = records->kernel();
+    if (kernel != nullptr) {
+      NvidiaNvtxw::cuptiKernelEvent event;
+      for (int i = 0; i < kernel->size(); ++i) {
+        auto k = kernel->Get(i);
+        event.time_start = k->start();
+        event.time_stop = k->end();
+        event.completed = k->completed();
+        event.grid_id = k->grid_id();
+        event.queued = k->queued();
+        event.submitted = k->submitted();
+        event.graph_node_id = k->graph_node_id();
+        event.local_memory_total_v2 = k->local_memory_total_v2();
+        event.name = fb_cstr(k->name());
+        event.device_id = k->device_id();
+        event.context_id = k->context_id();
+        event.stream_id = k->stream_id();
+        event.process_id = api_process_id;
+        event.grid_x = k->grid_x();
+        event.grid_y = k->grid_y();
+        event.grid_z = k->grid_z();
+        event.block_x = k->block_x();
+        event.block_y = k->block_y();
+        event.block_z = k->block_z();
+        event.static_shared_memory = k->static_shared_memory();
+        event.dynamic_shared_memory = k->dynamic_shared_memory();
+        event.local_memory_per_thread = k->local_memory_per_thread();
+        event.local_memory_total = k->local_memory_total();
+        event.correlation_id = k->correlation_id();
+        event.shared_memory_executed = k->shared_memory_executed();
+        event.graph_id = k->graph_id();
+        event.channel_id = k->channel_id();
+        event.cluster_x = k->cluster_x();
+        event.cluster_y = k->cluster_y();
+        event.cluster_z = k->cluster_z();
+        event.cluster_scheduling_policy = k->cluster_scheduling_policy();
+        event.registers_per_thread = k->registers_per_thread();
+        event.requested = k->requested();
+        event.executed = k->executed();
+        event.shared_memory_config = k->shared_memory_config();
+        event.partitioned_global_cache_requested = k->partitioned_global_cache_requested();
+        event.partitioned_global_cache_executed = k->partitioned_global_cache_executed();
+        event.launch_type = k->launch_type();
+        event.is_shared_memory_carveout_requested = k->is_shared_memory_carveout_requested();
+        event.shared_memory_carveout_requested = k->shared_memory_carveout_requested();
+        event.shmem_limit_config = k->shmem_limit_config();
+        event.channel_type = k->channel_type();
+        nvtxPayloadData_t payloadData[] = {
+          {NvidiaNvtxw::PayloadSchemaId::nameId, strlen(event.name)+1, event.name},
+          {NvidiaNvtxw::PayloadSchemaId::cuptiKernelId, sizeof(event), &event},
+        };
+        result = nvtxwInterface->EventWrite(stream, payloadData, std::extent<decltype(payloadData)>::value);
+        if (result != NVTXW3_RESULT_SUCCESS) {
+          fprintf(stderr, "Kernel EventWrite failed with code %d\n", static_cast<int>(result));
+          errorCode |= 4;
+        }
+      }
+    }
+    auto memcpy = records->memcpy();
+    if (memcpy != nullptr) {
+      NvidiaNvtxw::cuptiMemcpyEvent event;
+      for (int i = 0; i < memcpy->size(); ++i) {
+        auto m = memcpy->Get(i);
+        event.time_start = m->start();
+        event.time_stop = m->end();
+        event.bytes = m->bytes();
+        event.graph_node_id = 0;
+        event.device_id = m->device_id();
+        event.context_id = m->context_id();
+        event.stream_id = m->stream_id();
+        event.process_id = api_process_id;
+        event.correlation_id = m->correlation_id();
+        event.runtime_correlation_id = m->runtime_correlation_id();
+        event.graph_id = 0;
+        event.channel_id = m->channel_id();
+        event.copy_kind = m->copy_kind();
+        event.src_kind = m->src_kind();
+        event.dst_kind = m->dst_kind();
+        event.channelType = m->channel_type();
+        nvtxPayloadData_t payloadData[] = {
+          {NvidiaNvtxw::PayloadSchemaId::cuptiMemcpyId, sizeof(event), &event},
+        };
+        result = nvtxwInterface->EventWrite(stream, payloadData, std::extent<decltype(payloadData)>::value);
+        if (result != NVTXW3_RESULT_SUCCESS) {
+          fprintf(stderr, "Memcpy EventWrite failed with code %d\n", static_cast<int>(result));
+          errorCode |= 4;
+        }
+      }
+    }
+    auto memset = records->memset();
+    if (memset != nullptr) {
+      NvidiaNvtxw::cuptiMemsetEvent event;
+      for (int i = 0; i < memset->size(); ++i) {
+        auto m = memset->Get(i);
+        event.time_start = m->start();
+        event.time_stop = m->end();
+        event.bytes = m->bytes();
+        event.graph_node_id = 0;
+        event.device_id = m->device_id();
+        event.context_id = m->context_id();
+        event.stream_id = m->stream_id();
+        event.process_id = api_process_id;
+        event.correlation_id = m->correlation_id();
+        event.graph_id = 0;
+        event.channel_id = m->channel_id();
+        event.value = m->value();
+        event.mem_kind = m->memory_kind();
+        event.flags = m->flags();
+        event.channelType = m->channel_type();
+        nvtxPayloadData_t payloadData[] = {
+          {NvidiaNvtxw::PayloadSchemaId::cuptiMemsetId, sizeof(event), &event},
+        };
+        result = nvtxwInterface->EventWrite(stream, payloadData, std::extent<decltype(payloadData)>::value);
+        if (result != NVTXW3_RESULT_SUCCESS) {
+          fprintf(stderr, "Memset EventWrite failed with code %d\n", static_cast<int>(result));
+          errorCode |= 4;
+        }
+      }
+    }
+    auto overhead = records->overhead();
+    if (overhead != nullptr) {
+      NvidiaNvtxw::cuptiOverheadEvent event;
+      for (int i = 0; i < overhead->size(); ++i) {
+        auto o = overhead->Get(i);
+        auto object_id = o->object_id();
+        if (object_id != nullptr) {
+          event.time_start = o->start();
+          event.time_stop = o->end();
+          event.process_id = object_id->process_id() & 0xffffff;
+          event.thread_id = object_id->thread_id() & 0xffffff;
+          event.overhead_kind = o->overhead_kind();
+          nvtxPayloadData_t payloadData[] = {
+            {NvidiaNvtxw::PayloadSchemaId::cuptiOverheadId, sizeof(event), &event},
+          };
+          result = nvtxwInterface->EventWrite(stream, payloadData, std::extent<decltype(payloadData)>::value);
+          if (result != NVTXW3_RESULT_SUCCESS) {
+            fprintf(stderr, "Overhead EventWrite failed with code %d\n", static_cast<int>(result));
+            errorCode |= 4;
+          }
+        } else {
+          std::cerr << "Overhead activity has no object ID" << std::endl;
+        }
+      }
+    }
+    // probe the stream so eof() turns true once the last flatbuffer has been consumed
+    in.peek();
+  }
+  if (num_dropped_records) {
+    std::cerr << "Warning: " << num_dropped_records
+      << " records were noted as dropped in the profile" << std::endl;
+  }
+  for (auto const& [domain, handle] : domainToStreamMap) {
+    result = nvtxwInterface->StreamClose(handle);
+    if (result != NVTXW3_RESULT_SUCCESS) {
+      fprintf(stderr, "StreamClose failed for domain %s with code %d\n", domain.c_str(), static_cast<int>(result));
+      errorCode |= 8;
+    }
+  }
+  result = nvtxwInterface->StreamClose(stream);
+  if (result != NVTXW3_RESULT_SUCCESS) {
+    fprintf(stderr, "StreamClose failed with code %d\n", static_cast<int>(result));
+    errorCode |= 8;
+  }
+}
+
 int main(int argc, char* argv[])
 {
   constexpr int RESULT_SUCCESS = 0;
   constexpr int RESULT_FAILURE = 1;
   constexpr int RESULT_USAGE   = 2;
   program_options opts;
+  int errorCode = 0;
   std::vector<std::string_view> files;
   if (argc < 2) {
     print_usage();
@@ -740,6 +1134,24 @@ int main(int argc, char* argv[])
       } else {
         convert_to_nvtxt(in, std::cout, opts);
       }
+    } else if (opts.nvtxw) {
+      if (opts.output_path) {
+        void * nvtxwModuleHandle = nullptr;
+        nvtxwInterfaceCore_t *nvtxwInterface = nullptr;
+        nvtxwSessionHandle_t session;
+        nvtxwStreamHandle_t stream;
+        errorCode = initialize_nvtxw(in, opts.output_path.value().stem(), nvtxwModuleHandle, nvtxwInterface, session, stream);
+        if (errorCode == 0) {
+          convert_to_nvtxw(in, nvtxwInterface, session, stream, opts);
+          nvtxwResultCode_t result = nvtxwInterface->SessionEnd(session);
+          if (result != NVTXW3_RESULT_SUCCESS)
+          {
+              fprintf(stderr, "SessionEnd failed with code %d\n", (int)result);
+              return RESULT_FAILURE;
+          }
+        }
+        nvtxwUnload(nvtxwModuleHandle);
+      }
     } else {
       convert_to_nsys_rep(in, input_file, opts);
     }