diff --git a/src/main/cpp/profiler/CMakeLists.txt b/src/main/cpp/profiler/CMakeLists.txt index 03a552b3ea..915faca486 100644 --- a/src/main/cpp/profiler/CMakeLists.txt +++ b/src/main/cpp/profiler/CMakeLists.txt @@ -77,6 +77,11 @@ configure_file( add_executable(spark_rapids_profile_converter spark_rapids_profile_converter.cpp + initialize_nvtxw.cpp + nvtxw3.cpp + nvtxw3.h + NvtxwEvents.cpp + NvtxwEvents.h "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/profiler_schema.cpp" "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/spark_rapids_jni_version.cpp" "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}/profiler_generated.h" @@ -86,6 +91,8 @@ target_include_directories( spark_rapids_profile_converter PRIVATE "${CUDAToolkit_INCLUDE_DIRS}" + "${SPARK_RAPIDS_JNI_SOURCE_DIR}" + "${SPARK_RAPIDS_JNI_SOURCE_DIR}/profiler" "${SPARK_RAPIDS_JNI_SOURCE_DIR}/src" "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}" ) diff --git a/src/main/cpp/profiler/NvtxwEvents.cpp b/src/main/cpp/profiler/NvtxwEvents.cpp new file mode 100644 index 0000000000..5cd0873d00 --- /dev/null +++ b/src/main/cpp/profiler/NvtxwEvents.cpp @@ -0,0 +1,413 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Licensed under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.txt for license information. 
+ */ + +#include + +#include "NvtxwEvents.h" +/* NOTE(review): the bare #include above and the std::extent expressions below look like they lost their angle-bracket arguments in this copy - confirm against the original file. */ +namespace NvidiaNvtxw +{ +/* PAYLOAD_ENTRY_SIMPLE expands to a schema-entry initializer with flags, type and name filled in and the five remaining initializer values set to nullptr/0. */ +#define PAYLOAD_ENTRY_SIMPLE(flags, type, name) \ + { (flags), (type), (name), nullptr, 0, 0, nullptr, nullptr } + +// The C string containing the event's name must be provided in a special way. +static const nvtxPayloadSchemaEntry_t nameSchema[] = { + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE | NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED, + NVTX_PAYLOAD_ENTRY_TYPE_CSTRING, + "name" + ) +}; +static const nvtxPayloadSchemaAttr_t nameSchemaAttr{ /* dynamic-type schema flagged REFERENCED; its only entry is the zero-terminated "name" C string above */ + /*.fieldMask = */ + NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + /*.name = */ + nullptr, + /*.type = */ + NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC, + /*.flags = */ + NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED, + /*.entries = */ + nameSchema, + /*.numEntries = */ + std::extent::value, + /*.payloadStaticSize = */ + 0, + /*.packAlign = */ + 0, + /*.schemaId = */ + NvidiaNvtxw::PayloadSchemaId::nameId, + /*.extension = */ + nullptr + }; +/* Entry list shared by nvtxRangePushPopSchemaAttr and nvtxRangeStartEndSchemaAttr. NOTE(review): the last entry, "push_pop", has no same-named member in struct nvtxRangeEvent (NvtxwEvents.h) - confirm the struct and this list are in sync. */ +static const nvtxPayloadSchemaEntry_t nvtxRangeSchema[] = { + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_start" + ), + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_stop" + ), + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE | NVTX_PAYLOAD_ENTRY_FLAG_POINTER, + NVTX_PAYLOAD_ENTRY_TYPE_CSTRING, + "name" + ), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32, "thread_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_COLOR_ARGB, "color"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "push_pop") +}; +// TimeBase = Relative +static const nvtxPayloadSchemaAttr_t 
nvtxRangePushPopSchemaAttr = { + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + "NVTX Range Push Pop Event", + NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, + NVTX_PAYLOAD_SCHEMA_FLAG_NONE, + nvtxRangeSchema, + std::extent::value, + sizeof(struct NvidiaNvtxw::nvtxRangeEvent), + 0, + NvidiaNvtxw::PayloadSchemaId::nvtxRangePushPopId, + nullptr +}; +// TimeBase = Relative +static const nvtxPayloadSchemaAttr_t nvtxRangeStartEndSchemaAttr = { /* same entries and payload struct as the push/pop schema; only the schema name and schema id differ */ + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + "NVTX Range Start End Event", + NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, + NVTX_PAYLOAD_SCHEMA_FLAG_NONE, + nvtxRangeSchema, + std::extent::value, + sizeof(struct NvidiaNvtxw::nvtxRangeEvent), + 0, + NvidiaNvtxw::PayloadSchemaId::nvtxRangeStartEndId, + nullptr +}; +/* Schema entries paired with struct cuptiApiEvent (see cuptiApiSchemaAttr below). */ +static const nvtxPayloadSchemaEntry_t cuptiApiSchema[] = { + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_start" + ), + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_stop" + ), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "kind"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "cbid"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32, "thread_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "correlation_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "return_value") +}; +static const nvtxPayloadSchemaAttr_t cuptiApiSchemaAttr = { + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | 
NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + "CUPTI API Activity", + NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, + NVTX_PAYLOAD_SCHEMA_FLAG_NONE, + cuptiApiSchema, + std::extent::value, + sizeof(struct NvidiaNvtxw::cuptiApiEvent), + 0, + NvidiaNvtxw::PayloadSchemaId::cuptiApiId, + nullptr +}; +static const nvtxPayloadSchemaEntry_t cuptiDeviceSchema[] = { /* schema entries paired with struct cuptiDevice (see cuptiDeviceSchemaAttr) */ + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "global_memory_bandwidth"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "global_memory_size"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "constant_memory_size"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "l2_cache_size"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "num_threads_per_warp"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "core_clock_rate"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "num_memcpy_engines"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "num_multiprocessors"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_ipc"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_warps_per_multiprocessor"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_blocks_per_multiprocessor"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_shared_memory_per_multiprocessor"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_registers_per_multiprocessor"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_registers_per_block"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_shared_memory_per_block"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_threads_per_block"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_block_dim_x"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, 
"max_block_dim_y"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_block_dim_z"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_grid_dim_x"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_grid_dim_y"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "max_grid_dim_z"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "compute_capability_major"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "compute_capability_minor"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "ecc_enabled"), + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE | NVTX_PAYLOAD_ENTRY_FLAG_POINTER, + NVTX_PAYLOAD_ENTRY_TYPE_CSTRING, + "name" + ) +}; +static const nvtxPayloadSchemaAttr_t cuptiDeviceSchemaAttr = { + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + "CUPTI Device", + NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, + NVTX_PAYLOAD_SCHEMA_FLAG_NONE, + cuptiDeviceSchema, + std::extent::value, + sizeof(struct NvidiaNvtxw::cuptiDevice), + 0, + NvidiaNvtxw::PayloadSchemaId::cuptiDeviceId, + nullptr +}; +static const nvtxPayloadSchemaEntry_t cuptiKernelSchema[] = { /* schema entries paired with struct cuptiKernelEvent (see cuptiKernelSchemaAttr) */ + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_start" + ), + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_stop" + ), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "completed"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "grid_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "queued"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "submitted"), + 
PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "graph_node_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "local_memory_total_v2"), + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE | NVTX_PAYLOAD_ENTRY_FLAG_POINTER, + NVTX_PAYLOAD_ENTRY_TYPE_CSTRING, + "name" + ), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "device_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "context_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "stream_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "grid_x"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "grid_y"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "grid_z"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "block_x"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "block_y"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "block_z"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "static_shared_memory"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "dynamic_shared_memory"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "local_memory_per_thread"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "local_memory_total"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "correlation_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "shared_memory_executed"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "graph_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "channel_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "cluster_x"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "cluster_y"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "cluster_z"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "cluster_scheduling_policy"), + PAYLOAD_ENTRY_SIMPLE(0, 
NVTX_PAYLOAD_ENTRY_TYPE_UINT16, "registers_per_thread"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "requested"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "executed"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "shared_memory_config"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "partitioned_global_cache_requested"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "partitioned_global_cache_executed"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "launch_type"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "is_shared_memory_carveout_requested"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "shared_memory_carveout_requested"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "shmem_limit_config"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "channel_type") +}; +static const nvtxPayloadSchemaAttr_t cuptiKernelSchemaAttr = { + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + "CUPTI Kernel", + NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, + NVTX_PAYLOAD_SCHEMA_FLAG_NONE, + cuptiKernelSchema, + std::extent::value, + sizeof(struct NvidiaNvtxw::cuptiKernelEvent), + 0, + NvidiaNvtxw::PayloadSchemaId::cuptiKernelId, + nullptr +}; +static const nvtxPayloadSchemaEntry_t cuptiMemcpySchema[] = { /* schema entries paired with struct cuptiMemcpyEvent (see cuptiMemcpySchemaAttr) */ + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_start" + ), + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_stop" + ), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "bytes"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "graph_node_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, 
"device_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "context_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "stream_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "correlation_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "runtime_correlation_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "graph_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "channel_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "channel_type"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "copy_kind"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "src_kind"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "dst_kind") +}; +static const nvtxPayloadSchemaAttr_t cuptiMemcpySchemaAttr = { + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + "CUPTI Memcpy", + NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, + NVTX_PAYLOAD_SCHEMA_FLAG_NONE, + cuptiMemcpySchema, + std::extent::value, + sizeof(struct NvidiaNvtxw::cuptiMemcpyEvent), + 0, + NvidiaNvtxw::PayloadSchemaId::cuptiMemcpyId, + nullptr +}; +static const nvtxPayloadSchemaEntry_t cuptiMemsetSchema[] = { /* schema entries paired with struct cuptiMemsetEvent (see cuptiMemsetSchemaAttr) */ + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_start" + ), + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_stop" + ), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "bytes"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "graph_node_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "device_id"), + PAYLOAD_ENTRY_SIMPLE(0, 
NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "context_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "stream_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "correlation_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "graph_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "channel_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "value"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "channel_type"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "mem_kind"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "flags") +}; +static const nvtxPayloadSchemaAttr_t cuptiMemsetSchemaAttr = { + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + "CUPTI Memset", + NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, + NVTX_PAYLOAD_SCHEMA_FLAG_NONE, + cuptiMemsetSchema, + std::extent::value, + sizeof(struct NvidiaNvtxw::cuptiMemsetEvent), + 0, + NvidiaNvtxw::PayloadSchemaId::cuptiMemsetId, + nullptr +}; +static const nvtxPayloadSchemaEntry_t cuptiOverheadSchema[] = { /* schema entries paired with struct cuptiOverheadEvent (see cuptiOverheadSchemaAttr) */ + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_start" + ), + PAYLOAD_ENTRY_SIMPLE( + NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END | NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP, + NVTX_PAYLOAD_ENTRY_TYPE_UINT64, + "time_stop" + ), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32, "process_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32, "thread_id"), + PAYLOAD_ENTRY_SIMPLE(0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "overhead_kind"), +}; +static const nvtxPayloadSchemaAttr_t cuptiOverheadSchemaAttr = { + NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + 
NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + "CUPTI Overhead", + NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, + NVTX_PAYLOAD_SCHEMA_FLAG_NONE, + cuptiOverheadSchema, + std::extent::value, + sizeof(struct NvidiaNvtxw::cuptiOverheadEvent), + 0, + NvidiaNvtxw::PayloadSchemaId::cuptiOverheadId, + nullptr +}; +#undef PAYLOAD_ENTRY_SIMPLE +/* Accessors: each returns the address of the corresponding file-local schema attribute object defined above. */ +const nvtxPayloadSchemaAttr_t* GetNameSchemaAttr() +{ + return &nameSchemaAttr; +} +const nvtxPayloadSchemaAttr_t* GetNvtxRangePushPopSchemaAttr() +{ + return &nvtxRangePushPopSchemaAttr; +} +const nvtxPayloadSchemaAttr_t* GetNvtxRangeStartEndSchemaAttr() +{ + return &nvtxRangeStartEndSchemaAttr; +} +const nvtxPayloadSchemaAttr_t* GetCuptiApiSchemaAttr() +{ + return &cuptiApiSchemaAttr; +} +const nvtxPayloadSchemaAttr_t* GetCuptiDeviceSchemaAttr() +{ + return &cuptiDeviceSchemaAttr; +} +const nvtxPayloadSchemaAttr_t* GetCuptiKernelSchemaAttr() +{ + return &cuptiKernelSchemaAttr; +} +const nvtxPayloadSchemaAttr_t* GetCuptiMemcpySchemaAttr() +{ + return &cuptiMemcpySchemaAttr; +} +const nvtxPayloadSchemaAttr_t* GetCuptiMemsetSchemaAttr() +{ + return &cuptiMemsetSchemaAttr; +} +const nvtxPayloadSchemaAttr_t* GetCuptiOverheadSchemaAttr() +{ + return &cuptiOverheadSchemaAttr; +} +} \ No newline at end of file diff --git a/src/main/cpp/profiler/NvtxwEvents.h b/src/main/cpp/profiler/NvtxwEvents.h new file mode 100644 index 0000000000..6c6bff8304 --- /dev/null +++ b/src/main/cpp/profiler/NvtxwEvents.h @@ -0,0 +1,188 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Licensed under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.txt for license information. + */ + +#pragma once + +#include +#include + +namespace NvidiaNvtxw +{ +/* Numeric ids used as the schemaId value of each payload schema defined in NvtxwEvents.cpp. */ +namespace PayloadSchemaId +{ + static constexpr uint64_t nameId = 0xffffff00; + static constexpr uint64_t nvtxRangePushPopId = 0xffffff01; + static constexpr uint64_t cuptiApiId = 0xffffff02; + static constexpr uint64_t cuptiMemcpyId = 0xffffff03; + static constexpr uint64_t cuptiMemsetId = 0xffffff04; + static constexpr uint64_t cuptiDeviceId = 0xffffff05; + static constexpr uint64_t cuptiKernelId = 0xffffff06; + static constexpr uint64_t cuptiOverheadId = 0xffffff07; + static constexpr uint64_t nvtxRangeStartEndId = 0xffffff08; +}; +/* Each Get*SchemaAttr() function below returns a pointer to the matching schema attribute object defined in NvtxwEvents.cpp. */ +const nvtxPayloadSchemaAttr_t* GetNameSchemaAttr(); + +struct nvtxRangeEvent { /* payload struct whose sizeof is the static payload size of both NVTX range schemas. NOTE(review): nvtxRangeSchema in NvtxwEvents.cpp also lists a UINT8 "push_pop" entry that has no member here - confirm */ + uint64_t time_start; + uint64_t time_stop; + const char* name; + uint32_t process_id; + uint32_t thread_id; + uint32_t color; +}; +const nvtxPayloadSchemaAttr_t* GetNvtxRangePushPopSchemaAttr(); +const nvtxPayloadSchemaAttr_t* GetNvtxRangeStartEndSchemaAttr(); +struct cuptiApiEvent { /* members follow the order of the cuptiApiSchema entries in NvtxwEvents.cpp */ + uint64_t time_start; + uint64_t time_stop; + uint32_t kind; + uint32_t cbid; + uint32_t process_id; + uint32_t thread_id; + uint32_t correlation_id; + uint32_t return_value; +}; +const nvtxPayloadSchemaAttr_t* GetCuptiApiSchemaAttr(); +struct cuptiDevice { /* members follow the order of the cuptiDeviceSchema entries in NvtxwEvents.cpp */ + uint64_t global_memory_bandwidth; + uint64_t global_memory_size; + uint32_t constant_memory_size; + uint32_t l2_cache_size; + uint32_t num_threads_per_warp; + uint32_t core_clock_rate; + uint32_t num_memcpy_engines; 
+ uint32_t num_multiprocessors; + uint32_t max_ipc; + uint32_t max_warps_per_multiprocessor; + uint32_t max_blocks_per_multiprocessor; + uint32_t max_shared_memory_per_multiprocessor; + uint32_t max_registers_per_multiprocessor; + uint32_t max_registers_per_block; + uint32_t max_shared_memory_per_block; + uint32_t max_threads_per_block; + uint32_t max_block_dim_x; + uint32_t max_block_dim_y; + uint32_t max_block_dim_z; + uint32_t max_grid_dim_x; + uint32_t max_grid_dim_y; + uint32_t max_grid_dim_z; + uint32_t compute_capability_major; + uint32_t compute_capability_minor; + uint32_t id; + uint32_t ecc_enabled; + const char* name; +}; +const nvtxPayloadSchemaAttr_t* GetCuptiDeviceSchemaAttr(); +struct cuptiKernelEvent { /* members follow the order of the cuptiKernelSchema entries in NvtxwEvents.cpp */ + uint64_t time_start; + uint64_t time_stop; + uint64_t completed; + uint64_t grid_id; + uint64_t queued; + uint64_t submitted; + uint64_t graph_node_id; + uint64_t local_memory_total_v2; + const char * name; + uint32_t device_id; + uint32_t context_id; + uint32_t stream_id; + uint32_t process_id; + uint32_t grid_x; + uint32_t grid_y; + uint32_t grid_z; + uint32_t block_x; + uint32_t block_y; + uint32_t block_z; + uint32_t static_shared_memory; + uint32_t dynamic_shared_memory; + uint32_t local_memory_per_thread; + uint32_t local_memory_total; + uint32_t correlation_id; + uint32_t shared_memory_executed; + uint32_t graph_id; + uint32_t channel_id; + uint32_t cluster_x; + uint32_t cluster_y; + uint32_t cluster_z; + uint32_t cluster_scheduling_policy; + uint16_t registers_per_thread; + uint8_t requested; + uint8_t executed; + uint8_t shared_memory_config; + uint8_t partitioned_global_cache_requested; + uint8_t partitioned_global_cache_executed; + uint8_t launch_type; + uint8_t is_shared_memory_carveout_requested; + uint8_t shared_memory_carveout_requested; + uint8_t shmem_limit_config; + uint8_t channel_type; +}; +const nvtxPayloadSchemaAttr_t* GetCuptiKernelSchemaAttr(); + +struct cuptiMemcpyEvent { /* payload struct whose sizeof is the static payload size of cuptiMemcpySchemaAttr in NvtxwEvents.cpp */ + uint64_t time_start; + uint64_t time_stop; + 
uint64_t bytes; + uint64_t graph_node_id; + uint32_t device_id; + uint32_t context_id; + uint32_t stream_id; + uint32_t process_id; + uint32_t correlation_id; + uint32_t runtime_correlation_id; + uint32_t graph_id; + uint32_t channel_id; + uint8_t channelType; + uint8_t copy_kind; + uint8_t src_kind; + uint8_t dst_kind; +}; +const nvtxPayloadSchemaAttr_t* GetCuptiMemcpySchemaAttr(); + +struct cuptiMemsetEvent { + uint64_t time_start; + uint64_t time_stop; + uint64_t bytes; + uint64_t graph_node_id; + uint32_t device_id; + uint32_t context_id; + uint32_t stream_id; + uint32_t process_id; + uint32_t correlation_id; + uint32_t graph_id; + uint32_t channel_id; + uint32_t value; + uint8_t channelType; + uint8_t mem_kind; + uint8_t flags; +}; +const nvtxPayloadSchemaAttr_t* GetCuptiMemsetSchemaAttr(); +struct cuptiOverheadEvent { + uint64_t time_start; + uint64_t time_stop; + uint32_t process_id; + uint32_t thread_id; + uint8_t overhead_kind; +}; +const nvtxPayloadSchemaAttr_t* GetCuptiOverheadSchemaAttr(); + +} diff --git a/src/main/cpp/profiler/README-nvtxw.txt b/src/main/cpp/profiler/README-nvtxw.txt new file mode 100644 index 0000000000..87d6f37406 --- /dev/null +++ b/src/main/cpp/profiler/README-nvtxw.txt @@ -0,0 +1,22 @@ +1. NvtxwEvents.h and NvtxwEvents.cpp are copied from the Nsight Systems source code. They need to be kept in sync between this project and Nsight Systems. + +2. Set the NVTXW_BACKEND environment variable to the path of the libNvtxwBackend.so library in the host directory of a current build of Nsight Systems. For example: + > export NVTXW_BACKEND=/opt/nvidia/nsight-systems/2024.6.0/host-linux-x64/libNvtxwBackend.so + +3. Run like this: + > ./target/jni/cmake-build/profiler/spark_rapids_profile_converter -w -o file3021460.nsys-rep rapids-profile-3021460@jlowe-lcedt-driver.bin + and get output similar to this: + Backend implementation loaded! Applying config string... 
+ Loader config key/value pairs not provided + Creating report: "file3021460.nsys-rep" + - Created session: file3021460 + Session config key/value pairs not provided + - Created stream: Stream1 + Domain: SparkRAPIDS + Scope: + - Destroyed stream: Stream1 + 3946 events imported + - Destroyed session: file3021460 + Backend implementation prepared for unload. + +4. Load into nsight systems UI: nsys-ui file3021460.nsys-rep \ No newline at end of file diff --git a/src/main/cpp/profiler/initialize_nvtxw.cpp b/src/main/cpp/profiler/initialize_nvtxw.cpp new file mode 100644 index 0000000000..218b6c9785 --- /dev/null +++ b/src/main/cpp/profiler/initialize_nvtxw.cpp @@ -0,0 +1,202 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Licensed under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.txt for license information. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "nvtxw_events.h" /* NOTE(review): the events header added alongside this file is named NvtxwEvents.h - confirm that nvtxw_events.h resolves on case-sensitive filesystems */ +/* Opens a stream called `name` in `domain` on `session` through nvtxwInterface->StreamOpen and stores the handle in `stream`. Returns false, after printing to stderr, when StreamOpen does not return NVTXW3_RESULT_SUCCESS or when the returned handle's opaque member is null; returns true otherwise. */ +bool createNvtxwStream(const nvtxwInterfaceCore_t *nvtxwInterface, + const nvtxwSessionHandle_t& session, + const std::string & name, + const std::string & domain, + nvtxwStreamHandle_t & stream) +{ + nvtxwResultCode_t result = NVTXW3_RESULT_SUCCESS; + nvtxwStreamAttributes_t streamAttr = { + sizeof(nvtxwStreamAttributes_t), + name.c_str(), + domain.c_str(), + "", + NVTXW3_STREAM_ORDER_INTERLEAVING_NONE, + NVTXW3_STREAM_ORDERING_TYPE_UNKNOWN, + NVTXW3_STREAM_ORDERING_SKID_NONE, + 0 + }; + result = nvtxwInterface->StreamOpen(&stream, session, &streamAttr); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "StreamOpen failed with code %d\n", (int)result); + return false; + } + if (!stream.opaque) + { + fprintf(stderr, "StreamOpen returned null stream handle!\n"); + return false; + } + return true; +} +/* initialize_nvtxw: loads the NVTXW backend library (file name taken from the NVTXW_BACKEND environment variable, otherwise "libNvtxwBackend.so"), fetches the core interface, begins a session named outName, opens a stream with name and domain "CUPTI", and registers the payload schemas on it. Outputs are written to nvtxwModuleHandle, nvtxwInterface, session and stream. Returns 1 when library initialization, interface lookup, session begin or stream creation fails; otherwise returns 0, or 2 if any SchemaRegister call failed. The `in` parameter is not read anywhere in this function. */ +/// outName: basename of output nsys-rep, without .nsys-rep extension +int initialize_nvtxw(std::ifstream& in, const std::string& outName, + void *& nvtxwModuleHandle, + nvtxwInterfaceCore_t *&nvtxwInterface, + nvtxwSessionHandle_t &session, + nvtxwStreamHandle_t &stream) { + nvtxwResultCode_t result = NVTXW3_RESULT_SUCCESS; + int errorCode = 0; + // initialize + static const char soNameDefault[] = "libNvtxwBackend.so"; + const char *soName = soNameDefault; + const char *backend_env = getenv("NVTXW_BACKEND"); + if (backend_env) + { + soName = backend_env; + } + nvtxwGetInterface_t getInterfaceFunc = nullptr; + result = nvtxwInitialize( + NVTXW3_INIT_MODE_LIBRARY_FILENAME, + soName, + &getInterfaceFunc, + &nvtxwModuleHandle); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "nvtxwInitialize failed with code %d\n", (int)result); + if (result == NVTXW3_RESULT_LIBRARY_NOT_FOUND) + fprintf(stderr, "Failed to find %s\n", soName); + return 1; + } + if 
(!getInterfaceFunc) + { + fprintf(stderr, "nvtxwInitialize returned null nvtxwGetInterface_t!\n"); + return 1; + } + + const void* interfaceVoid; + result = getInterfaceFunc( + NVTXW3_INTERFACE_ID_CORE_V1, + &interfaceVoid); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "getInterfaceFunc failed with code %d\n", (int)result); + return 1; + } + if (!interfaceVoid) + { + fprintf(stderr, "getInterfaceFunc returned null nvtxwInterface pointer!\n"); + return 1; + } + nvtxwInterface = reinterpret_cast((void*)interfaceVoid); /* NOTE(review): the reinterpret_cast target type appears to be missing in this copy - confirm against the original file */ + + // session begin + char* sessionConfig = nullptr; + nvtxwSessionAttributes_t sessionAttr = { + sizeof(nvtxwSessionAttributes_t), + outName.c_str(), + sessionConfig + }; + result = nvtxwInterface->SessionBegin(&session, &sessionAttr); + free(sessionConfig); /* sessionConfig is initialized to nullptr above and never reassigned, so this free has nothing to release */ + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "SessionBegin failed with code %d\n", (int)result); + return 1; + } + if (!session.opaque) + { + fprintf(stderr, "SessionBegin returned null session handle!\n"); + return 1; + } + + // stream open + std::string streamName("CUPTI"); + std::string domainName("CUPTI"); + bool valid = createNvtxwStream(nvtxwInterface, session, streamName, domainName, stream); + if (!valid) + { + errorCode |= 1; + return errorCode; + } + // schema register + result = nvtxwInterface->SchemaRegister(stream, NvidiaNvtxw::GetNameSchemaAttr()); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "SchemaRegister failed for 'nameSchema' with code %d\n", (int)result); + errorCode |= 2; + } + result = nvtxwInterface->SchemaRegister(stream, NvidiaNvtxw::GetNvtxRangePushPopSchemaAttr()); /* NOTE(review): this function has no matching call for GetNvtxRangeStartEndSchemaAttr() - confirm whether the start/end range schema is registered elsewhere */ + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "SchemaRegister failed with 'nvtxRangePushPopSchema' with code %d\n", (int)result); + errorCode |= 2; + } + result = nvtxwInterface->SchemaRegister(stream, NvidiaNvtxw::GetCuptiApiSchemaAttr()); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "SchemaRegister failed with 'cuptiApiSchema' 
with code %d\n", (int)result); + errorCode |= 2; + } + result = nvtxwInterface->SchemaRegister(stream, NvidiaNvtxw::GetCuptiDeviceSchemaAttr()); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "SchemaRegister failed with 'cuptiDeviceSchema' with code %d\n", (int)result); + errorCode |= 2; + } + result = nvtxwInterface->SchemaRegister(stream, NvidiaNvtxw::GetCuptiKernelSchemaAttr()); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "SchemaRegister failed with 'cuptiKernelSchema' with code %d\n", (int)result); + errorCode |= 2; + } + result = nvtxwInterface->SchemaRegister(stream, NvidiaNvtxw::GetCuptiMemcpySchemaAttr()); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "SchemaRegister failed with 'cuptiMemcpySchema' with code %d\n", (int)result); + errorCode |= 2; + } + result = nvtxwInterface->SchemaRegister(stream, NvidiaNvtxw::GetCuptiMemsetSchemaAttr()); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "SchemaRegister failed with 'cuptiMemsetSchema' with code %d\n", (int)result); + errorCode |= 2; + } + result = nvtxwInterface->SchemaRegister(stream, NvidiaNvtxw::GetCuptiOverheadSchemaAttr()); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "SchemaRegister failed with 'cuptiOverheadSchema' with code %d\n", (int)result); + errorCode |= 2; + } + return errorCode; /* 0 when every schema registered; 2 when at least one SchemaRegister call failed (registration continues after a failure) */ +} diff --git a/src/main/cpp/profiler/nvtx3/nvToolsExtPayload.h b/src/main/cpp/profiler/nvtx3/nvToolsExtPayload.h new file mode 100644 index 0000000000..3c750f7b13 --- /dev/null +++ b/src/main/cpp/profiler/nvtx3/nvToolsExtPayload.h @@ -0,0 +1,1173 @@ +/* +* Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#include "nvtx3/nvToolsExt.h" + +/* Optionally include helper macros. 
*/ +/* #include "nvToolsExtPayloadHelper.h" */ + +/** + * If needed, semantic extension headers can be included after this header. + */ + +/** + * \brief The compatibility ID is used for versioning of this extension. + */ +#ifndef NVTX_EXT_PAYLOAD_COMPATID +#define NVTX_EXT_PAYLOAD_COMPATID 0x0104 +#endif + +/** + * \brief The module ID identifies the payload extension. It has to be unique + * among the extension modules. + */ +#ifndef NVTX_EXT_PAYLOAD_MODULEID +#define NVTX_EXT_PAYLOAD_MODULEID 2 +#endif + +/** + * \brief Additional value for the enum @ref nvtxPayloadType_t + */ +#ifndef NVTX_PAYLOAD_TYPE_EXT +#define NVTX_PAYLOAD_TYPE_EXT ((int32_t)0xDFBD0009) +#endif + +/** --------------------------------------------------------------------------- + * Payload schema entry flags. Used for @ref nvtxPayloadSchemaEntry_t::flags. + * ------------------------------------------------------------------------- */ +#ifndef NVTX_PAYLOAD_ENTRY_FLAGS_V1 +#define NVTX_PAYLOAD_ENTRY_FLAGS_V1 + +#define NVTX_PAYLOAD_ENTRY_FLAG_UNUSED 0 + +/** + * Absolute pointer into a payload (entry) of the same event. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_POINTER (1 << 1) + +/** + * Offset from base address of the payload. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_BASE (1 << 2) + +/** + * Offset from the end of this payload entry. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_HERE (1 << 3) + +/** + * The value is an array with fixed length, set with the field `arrayLength`. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE (1 << 4) + +/** + * The value is a zero-/null-terminated array. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED (2 << 4) + +/** + * \brief A single or multi-dimensional array of variable length. + * + * The field `arrayOrUnionDetail` contains the index of the schema entry that + * holds the length(s). If the length entry is a scalar, then this entry is a 1D + * array. 
If the length entry is a fixed-size array, then the number of + * dimensions is defined with the registration of the schema. If the length + * entry is a zero-terminated array, then the array of the dimensions can be + * determined at runtime. + * For multidimensional arrays, values are stored in row-major order, with rows + * being stored consecutively in contiguous memory. The size of the entry (in + * bytes) is the product of the dimensions multiplied with size of the array + * element. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX (3 << 4) + +/** + * \brief A single or multi-dimensional array of variable length, where the + * dimensions are stored in a different payload (index) of the same event. + * + * This enables an existing address to an array to be directly passed, while the + * dimensions are defined in a separate payload (with only one payload entry). + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_PAYLOAD_INDEX (4 << 4) + +/** + * \brief The value or data that is pointed to by this payload entry value shall + * be copied by the NVTX handler. + * + * A tool may not support deep copy and just ignore this flag. + * See @ref NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY for more details. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY (1 << 8) + +/** + * Notifies the NVTX handler to hide this entry in case of visualization. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_HIDE (1 << 9) + +/** + * The entry specifies the event message. Any string type can be used. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE (1 << 10) + +/** + * \brief The entry contains an event timestamp. + * + * The time source might be provided via the entry semantics field. In most + * cases, the timestamp (entry) type is @ref NVTX_PAYLOAD_ENTRY_TYPE_UINT64. + */ +#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_TIMESTAMP (2 << 10) + +/** + * These flags specify the NVTX event type to which an entry refers. 
+ */ +#define NVTX_PAYLOAD_ENTRY_FLAG_RANGE_BEGIN (1 << 12) +#define NVTX_PAYLOAD_ENTRY_FLAG_RANGE_END (2 << 12) +#define NVTX_PAYLOAD_ENTRY_FLAG_MARK (3 << 12) +#define NVTX_PAYLOAD_ENTRY_FLAG_COUNTER (4 << 12) + +#endif /* NVTX_PAYLOAD_ENTRY_FLAGS_V1 */ +/** --------------------------------------------------------------------------- + * END: Payload schema entry flags. + * ------------------------------------------------------------------------- */ + +/** \todo: Keep this in the header? */ +/** + * @note The ‘array’ flags assume that the array is embedded. Otherwise, + * @ref NVTX_PAYLOAD_ENTRY_FLAG_POINTER has to be additionally specified. Some + * combinations may be invalid based on the `NVTX_PAYLOAD_SCHEMA_TYPE_*` this + * entry is enclosed. For instance, variable length embedded arrays are valid + * within @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC but invalid with + * @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC. See `NVTX_PAYLOAD_SCHEMA_TYPE_*` for + * additional details. + */ + +/* Helper macro to check if an entry represents an array. */ +#define NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY (\ + NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE | \ + NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED | \ + NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX) + +#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_TYPE(F) \ + ((F) & NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY) +/** \todo end */ + + +/** --------------------------------------------------------------------------- + * Types of entries in a payload schema. + * + * @note Several of the predefined types contain the size (in bits) in their + * names. For some data types the size (in bytes) is not fixed and may differ + * for different platforms/operating systems/compilers. 
To provide portability, + * an array of sizes (in bytes) for type 1 to 28 ( @ref + * NVTX_PAYLOAD_ENTRY_TYPE_CHAR to @ref NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE) + * is passed to the NVTX extension initialization function + * @ref InitializeInjectionNvtxExtension via the `extInfo` field of + * @ref nvtxExtModuleInfo_t. + * ------------------------------------------------------------------------- */ +#ifndef NVTX_PAYLOAD_ENTRY_TYPES_V1 +#define NVTX_PAYLOAD_ENTRY_TYPES_V1 + +#define NVTX_PAYLOAD_ENTRY_TYPE_INVALID 0 + +/** + * Basic integer types. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR 1 +#define NVTX_PAYLOAD_ENTRY_TYPE_UCHAR 2 +#define NVTX_PAYLOAD_ENTRY_TYPE_SHORT 3 +#define NVTX_PAYLOAD_ENTRY_TYPE_USHORT 4 +#define NVTX_PAYLOAD_ENTRY_TYPE_INT 5 +#define NVTX_PAYLOAD_ENTRY_TYPE_UINT 6 +#define NVTX_PAYLOAD_ENTRY_TYPE_LONG 7 +#define NVTX_PAYLOAD_ENTRY_TYPE_ULONG 8 +#define NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG 9 +#define NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG 10 + +/** + * Integer types with explicit size. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_INT8 11 +#define NVTX_PAYLOAD_ENTRY_TYPE_UINT8 12 +#define NVTX_PAYLOAD_ENTRY_TYPE_INT16 13 +#define NVTX_PAYLOAD_ENTRY_TYPE_UINT16 14 +#define NVTX_PAYLOAD_ENTRY_TYPE_INT32 15 +#define NVTX_PAYLOAD_ENTRY_TYPE_UINT32 16 +#define NVTX_PAYLOAD_ENTRY_TYPE_INT64 17 +#define NVTX_PAYLOAD_ENTRY_TYPE_UINT64 18 + +/** + * Floating point types + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT 19 +#define NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE 20 +#define NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE 21 + +/** + * Size type (`size_t` in C). + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_SIZE 22 + +/** + * Any address, e.g. `void*`. If the pointer type matters, use the flag @ref + * NVTX_PAYLOAD_ENTRY_FLAG_POINTER and the respective type instead. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS 23 + +/** + * Special character types. 
+ */ +#define NVTX_PAYLOAD_ENTRY_TYPE_WCHAR 24 /* wide character (since C90) */ +#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 25 /* since C2x and C++20 */ +#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 26 +#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 27 + +/** + * There is type size and alignment information for all previous types. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE (NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 + 1) + +/** + * Store raw 8-bit binary data. As with `char`, 1-byte alignment is assumed. + * Typically, a tool will display this as hex or binary. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_BYTE 32 + +/** + * These types do not have standardized equivalents. It is assumed that the + * number at the end corresponds to the bits used to store the value and that + * the alignment corresponds to standardized types of the same size. + * A tool may not support these types. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_INT128 33 +#define NVTX_PAYLOAD_ENTRY_TYPE_UINT128 34 + +#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT16 42 +#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT32 43 +#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT64 44 +#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT128 45 + +#define NVTX_PAYLOAD_ENTRY_TYPE_BF16 50 +#define NVTX_PAYLOAD_ENTRY_TYPE_TF32 52 + +/** + * Data types are as defined by NVTXv3 core. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_CATEGORY 68 /* uint32_t */ +#define NVTX_PAYLOAD_ENTRY_TYPE_COLOR_ARGB 69 /* uint32_t */ + +/** + * The scope of events or counters (see `nvtxScopeRegister`). + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_SCOPE_ID 70 + +/** + * Process ID as scope + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT32 71 +#define NVTX_PAYLOAD_ENTRY_TYPE_PID_UINT64 72 + +/** + * Thread ID as scope (see `nvtxGetActiveThreadId` for valid values). + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT32 73 +#define NVTX_PAYLOAD_ENTRY_TYPE_TID_UINT64 74 + +/** + * \brief String types. 
+ * + * If no flags are set for the entry and `arrayOrUnionDetail > 0`, the entry is + * assumed to be a fixed-size string with the given length, embedded in the payload. + * `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is redundant for fixed-size strings. + * + * \todo(Revise the following paragraph.) + * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` specifies a + * zero-terminated string. If `arrayOrUnionDetail > 0`, the entry is handled as + * a zero-terminated array of fixed-size strings. + * + * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies a + * variable-length string with the length given in the entry specified by the + * field `arrayOrUnionDetail`. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING 75 /* `char*`, system LOCALE */ +#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF8 76 +#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF16 77 +#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF32 78 + +/** + * The entry value is of type @ref nvtxStringHandle_t returned by + * @ref nvtxDomainRegisterString. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE 80 + +/** + * This type marks the union selector member (entry index) in schemas used by + * a union with internal selector. + * See @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_UNION_SELECTOR 100 + +/** + * \brief Predefined schema ID for payload data that is referenced in another payload. + * + * This schema ID can be used in @ref nvtxPayloadData_t::schema_id to indicate that the + * payload is a blob of memory which other payload entries may point into. + * A tool will not expose this payload directly. + * + * This schema ID cannot be used as schema entry type! + */ +#define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022 + +/** + * \brief Predefined schema ID for raw payload data. 
+ * + * This schema ID can be used in @ref nvtxPayloadData_t::schema_id to indicate + * that the payload is a blob, which can be shown with an arbitrary data viewer. + * This schema ID cannot be used as schema entry type! + */ +#define NVTX_TYPE_PAYLOAD_SCHEMA_RAW 1023 + +/** + * \deprecated: Remove for official release! + * In the initial version of this header custom schema IDs started + * here. Unless predefined types require more than 16 bits we can keep this + * value to preserve backwards compatibility. The value is not used as first + * ID for custom schemas any more, but in the analysis every entry type >= this + * value is assumed to be a custom schema. + */ +#define NVTX_PAYLOAD_ENTRY_TYPE_CUSTOM_BASE 65536 + +/* Custom (static) schema IDs. */ +#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START (1 << 24) + +/* Dynamic schema IDs (generated by the tool) start here. */ +#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START ((uint64_t)1 << 32) + +#endif /* NVTX_PAYLOAD_ENTRY_TYPES_V1 */ +/** --------------------------------------------------------------------------- + * END: Payload schema entry types. + * ------------------------------------------------------------------------- */ + + +#ifndef NVTX_PAYLOAD_SCHEMA_TYPES_V1 +#define NVTX_PAYLOAD_SCHEMA_TYPES_V1 + +/** + * \brief The payload schema type. + * + * A schema can be either of the following types. It is set with + * @ref nvtxPayloadSchemaAttr_t::type. + */ +#define NVTX_PAYLOAD_SCHEMA_TYPE_INVALID 0 +#define NVTX_PAYLOAD_SCHEMA_TYPE_STATIC 1 +#define NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC 2 +#define NVTX_PAYLOAD_SCHEMA_TYPE_UNION 3 +#define NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR 4 + +#endif /* NVTX_PAYLOAD_SCHEMA_TYPES_V1 */ + + +#ifndef NVTX_PAYLOAD_SCHEMA_FLAGS_V1 +#define NVTX_PAYLOAD_SCHEMA_FLAGS_V1 + +/** + * \brief Flags for static and dynamic schemas. + * + * The schema flags are used with @ref nvtxPayloadSchemaAttr_t::flags. 
+ */ +#define NVTX_PAYLOAD_SCHEMA_FLAG_NONE 0 + +/** + * This flag indicates that a schema and the corresponding payloads can + * contain fields which require a deep copy. + */ +#define NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY (1 << 1) + +/** + * This flag indicates that a schema and the corresponding payload can be + * referenced by another payload of the same event. If the schema is not + * intended to be visualized directly, it is possible to use + * @ref NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED instead. + */ +#define NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED (1 << 2) + +/** + * The schema defines a counter group. An NVTX handler can expect that the schema + * contains entries with counter semantics. + */ +#define NVTX_PAYLOAD_SCHEMA_FLAG_COUNTER_GROUP (1 << 3) + + +#endif /* NVTX_PAYLOAD_SCHEMA_FLAGS_V1 */ + + +#ifndef NVTX_PAYLOAD_SCHEMA_ATTRS_V1 +#define NVTX_PAYLOAD_SCHEMA_ATTRS_V1 + +/** + * The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be + * specified via setting the field `fieldMask`. + */ +#define NVTX_PAYLOAD_SCHEMA_ATTR_NAME (1 << 1) +#define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE (1 << 2) +#define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS (1 << 3) +#define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES (1 << 4) +#define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5) +#define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6) +#define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT (1 << 7) +#define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID (1 << 8) +#define NVTX_PAYLOAD_SCHEMA_ATTR_EXTENSION (1 << 9) + +#endif /* NVTX_PAYLOAD_SCHEMA_ATTRS_V1 */ + + +#ifndef NVTX_PAYLOAD_ENUM_ATTRS_V1 +#define NVTX_PAYLOAD_ENUM_ATTRS_V1 + +/** + * The values are used to set the field `fieldMask` and specify which fields in + * @ref nvtxPayloadEnumAttr_t are set. 
+ */ +#define NVTX_PAYLOAD_ENUM_ATTR_NAME (1 << 1) +#define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES (1 << 2) +#define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3) +#define NVTX_PAYLOAD_ENUM_ATTR_SIZE (1 << 4) +#define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID (1 << 5) +#define NVTX_PAYLOAD_ENUM_ATTR_EXTENSION (1 << 6) + +#endif /* NVTX_PAYLOAD_ENUM_ATTRS_V1 */ + +/** Deprecated NVTX scope defines. */ +#ifndef NVTX_SCOPES_V0 +#define NVTX_SCOPES_V0 + +#define NVTX_EVENT_SCOPE_INVALID 0 +#define NVTX_EVENT_SCOPE_NONE 1 /* Global/base/root or no scope */ + +/* Hardware events */ +#define NVTX_EVENT_SCOPE_HW_MACHINE 2 /* Node/machine name, Device? */ +#define NVTX_EVENT_SCOPE_HW_SOCKET 3 +#define NVTX_EVENT_SCOPE_HW_CPU 4 +#define NVTX_EVENT_SCOPE_HW_CPU_LOGICAL 5 +/* Innermost HW execution context at registration time */ +#define NVTX_EVENT_SCOPE_HW_INNERMOST 6 + +/* Virtualized hardware, virtual machines */ +#define NVTX_EVENT_SCOPE_VM 7 + +/* Software scopes */ +#define NVTX_EVENT_SCOPE_SW_PROCESS 8 /* Process scope */ +#define NVTX_EVENT_SCOPE_SW_THREAD 9 /* Thread scope */ +/* Innermost SW execution context at registration time */ +#define NVTX_EVENT_SCOPE_SW_INNERMOST 10 + +#endif /* NVTX_SCOPES_V0 */ + +/** + * An NVTX scope specifies the execution scope or source of events or counters. + * A tool determines the value for a predefined scope when the sample is taken. + */ +#ifndef NVTX_SCOPES_V1 +#define NVTX_SCOPES_V1 + +#define NVTX_SCOPE_NONE 0 /* No scope specified. */ +#define NVTX_SCOPE_ROOT 1 /* The root in a hierarchy. 
*/ + +/* Hardware events */ +#define NVTX_SCOPE_CURRENT_HW_MACHINE 2 /* Node/machine name */ +#define NVTX_SCOPE_CURRENT_HW_SOCKET 3 +#define NVTX_SCOPE_CURRENT_HW_CPU_PHYSICAL 4 /* Physical CPU core */ +#define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL 5 /* Logical CPU core */ +/* Innermost HW execution context */ +#define NVTX_SCOPE_CURRENT_HW_INNERMOST 15 + +/* Virtualized hardware, virtual machines */ +#define NVTX_SCOPE_CURRENT_HYPERVISOR 16 +#define NVTX_SCOPE_CURRENT_VM 17 +#define NVTX_SCOPE_CURRENT_KERNEL 18 +#define NVTX_SCOPE_CURRENT_CONTAINER 19 +#define NVTX_SCOPE_CURRENT_OS 20 + +/* Software scopes */ +#define NVTX_SCOPE_CURRENT_SW_PROCESS 21 /* Process scope */ +#define NVTX_SCOPE_CURRENT_SW_THREAD 22 /* Thread scope */ +/* Innermost SW execution context */ +#define NVTX_SCOPE_CURRENT_SW_INNERMOST 31 + +/** Static (user-provided) scope IDs (feed forward) */ +#define NVTX_SCOPE_ID_STATIC_START (1 << 24) + +/* Dynamically (tool) generated scope IDs */ +#define NVTX_SCOPE_ID_DYNAMIC_START 4294967296 /* 1 << 32 */ + +#endif /* NVTX_SCOPES_V1 */ + + +#ifndef NVTX_DEFERRED_EVENTS_SORTING_V1 +#define NVTX_DEFERRED_EVENTS_SORTING_V1 +/** + * Deferred events are assumed to be in chronological order by default. + */ +#define NVTX_DEFERRED_EVENTS_SORTED 0 +#define NVTX_DEFERRED_EVENTS_SORTED_PER_EVENT_SOURCE 1 +#define NVTX_DEFERRED_EVENTS_UNSORTED 2 + +#endif /* NVTX_DEFERRED_EVENTS_SORTING_V1 */ + + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifndef NVTX_PAYLOAD_TYPEDEFS_V1 +#define NVTX_PAYLOAD_TYPEDEFS_V1 + +/** + * \brief Size and alignment information for predefined payload entry types. + * + * The struct contains the size and the alignment size in bytes. A respective + * array for the predefined types is passed via nvtxExtModuleInfo_t to the NVTX + * client/handler. The type (ID) is used as index into this array. 
+ */ +typedef struct nvtxPayloadEntryTypeInfo_v1 +{ + uint16_t size; + uint16_t align; +} nvtxPayloadEntryTypeInfo_t; + +/** + * \brief Binary payload data, size and decoding information. + * + * An array of type `nvtxPayloadData_t` is attached to an NVTX event via the + * `payload.ullValue` field of the NVTX event attributes. + * + * The `schemaId` must be a predefined schema entry type (`NVTX_PAYLOAD_ENTRY_TYPE*`), + * a schema ID (statically specified or dynamically created) or one of + * `NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED` or `NVTX_TYPE_PAYLOAD_SCHEMA_RAW`. + * + * Setting the size of a payload to `SIZE_MAX` can be useful to reduce the + * overhead of NVTX instrumentation, when no NVTX handler is attached. However, + * a tool might not be able to detect the size of a payload and thus skip it. + * A reasonable use case is a payload that represents a null-terminated + * C string, where the NVTX handler can call `strlen()`. + */ +typedef struct nvtxPayloadData_v1 +{ + /** + * The schema ID, which defines the layout of the binary data. + */ + uint64_t schemaId; + + /** + * Size of the payload (blob) in bytes. `SIZE_MAX` (`-1`) indicates to the tool + * that it should figure out the size, which might not be possible. + */ + size_t size; + + /** + * Pointer to the binary payload data. + */ + const void* payload; +} nvtxPayloadData_t; + + +/** + * \brief Header of the payload entry's semantic field. + * + * If the semantic field of the payload schema entry is set, the first four + * fields (header) are defined with this type. A tool can iterate through the + * extensions and check whether it supports (can handle) them. + */ +typedef struct nvtxSemanticsHeader_v1 +{ + uint32_t structSize; /** Size of semantic extension struct. */ + uint16_t semanticId; + uint16_t version; + const struct nvtxSemanticsHeader_v1* next; /** linked list */ + /* Additional fields are defined by the specific semantic extension. */ +} nvtxSemanticsHeader_t; + +/** + * \brief Entry in a schema. 
+ * + * A payload schema consists of an array of payload schema entries. It is + * registered with @ref nvtxPayloadSchemaRegister. `flags` can be set to `0` for + * simple values, `type` is the only "required" field. If not set explicitly, + * all other fields are zero-initialized, which means that the entry has no name + * and the offset is determined based on self-alignment rules. + * + * Example schema: + * nvtxPayloadSchemaEntry_t schema[] = { + * {0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "one byte"}, + * {0, NVTX_PAYLOAD_ENTRY_TYPE_INT32, "four bytes"} + * }; + */ +typedef struct nvtxPayloadSchemaEntry_v1 +{ + /** + * \brief Flags to augment the basic type. + * + * This field allows additional properties of the payload entry to be + * specified. Valid values are `NVTX_PAYLOAD_ENTRY_FLAG_*`. + */ + uint64_t flags; + + /** + * \brief Predefined payload schema entry type or custom schema ID. + * + * Predefined types are `NVTX_PAYLOAD_ENTRY_TYPE_*`. Passing a schema ID + * enables nesting of schemas. + */ + uint64_t type; + + /** + * \brief Name or label of the payload entry. (Optional) + * + * A meaningful name or label can help organizing and interpreting the data. + */ + const char* name; + + /** + * \brief Description of the payload entry. (Optional) + * + * A more detailed description of the data that is stored with this entry. + */ + const char* description; + + /** + * \brief String length, array length or member selector for union types. + * + * If @ref type is a C string type, this field specifies the string length. + * + * If @ref flags specify that the entry is an array, this field specifies + * the array length. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more details. + * + * If @ref type is a union with schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION + * (external selection of the union member), this field contains the index + * (starting with 0) to an entry of integral type in the same schema. The + * associated field value specifies the selected union member. 
+ * + * @note An array of schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION is not + * supported. @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR can + * be used instead. + */ + uint64_t arrayOrUnionDetail; + + /** + * \brief Offset in the binary payload data (in bytes). + * + * This field specifies the byte offset from the base address of the actual + * binary data (blob) to the start address of the data of this entry. + * + * It is recommended (but not required) to provide the offset. Otherwise, + * the NVTX handler will determine the offset from natural alignment rules. + * In some cases, e.g. dynamic schema layouts, the offset cannot be set and + * has to be determined based on the data of prior entries. + * + * Setting the offset can also be used to skip entries during payload parsing. + */ + uint64_t offset; + + /** + * \brief Additional semantics of the payload entry. + * + * The field points to the first element in a linked list, which enables + * multiple semantic extensions. + */ + const nvtxSemanticsHeader_t* semantics; + + /** + * \brief Reserved for future use. Do not use it! + */ + const void* reserved; +} nvtxPayloadSchemaEntry_t; + +/** + * \brief NVTX payload schema attributes. + */ +typedef struct nvtxPayloadSchemaAttr_v1 +{ + /** + * \brief Mask of valid fields in this struct. + * + * Use the `NVTX_PAYLOAD_SCHEMA_ATTR_*` defines. + */ + uint64_t fieldMask; + + /** + * \brief Name of the payload schema. (Optional) + */ + const char* name; + + /** + * \brief Payload schema type. (Mandatory) \anchor PAYLOAD_TYPE_FIELD + * + * Use the `NVTX_PAYLOAD_SCHEMA_TYPE_*` defines. + */ + uint64_t type; + + /** + * \brief Payload schema flags. (Optional) + * + * Flags defined by `NVTX_PAYLOAD_SCHEMA_FLAG_*` can be used to set + * additional properties of the schema. + */ + uint64_t flags; + + /** + * \brief Entries of a payload schema. 
(Mandatory) \anchor ENTRIES_FIELD + * + * This field is a pointer to an array of schema entries, each describing a + * field in a data structure, e.g. in a C struct or union. + */ + const nvtxPayloadSchemaEntry_t* entries; + + /** + * \brief Number of entries in the payload schema. (Mandatory) + * + * Number of entries in the array of payload entries \ref ENTRIES_FIELD. + */ + size_t numEntries; + + /** + * \brief The binary payload size in bytes for static payload schemas. + * + * If \ref PAYLOAD_TYPE_FIELD is @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC this + * value is ignored. If this field is not specified for a schema of type + * @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, the size can be automatically + * determined by a tool. + */ + size_t payloadStaticSize; + + /** + * \brief The byte alignment for packed structures. + * + * If not specified, this field defaults to `0`, which means that the fields + * in the data structure are not packed and natural alignment rules can be + * applied. + */ + size_t packAlign; + + /** + * Static/custom schema ID must be + * >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and + * < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START + */ + uint64_t schemaId; + + /** + * Flexible extension for schema attributes. + * (Do not use. Reserved for future use.) + */ + void* extension; +} nvtxPayloadSchemaAttr_t; + +/** + * \brief This type is used to describe an enumeration. + * + * Since the value of an enum entry might not be meaningful for the analysis + * and/or visualization, a tool can show the name of enum entry instead. + * + * An array of this struct is passed to @ref nvtxPayloadEnumAttr_t::entries to be + * finally registered via @ref nvtxPayloadEnumRegister with the NVTX handler. + * + * @note EXPERIMENTAL + */ +typedef struct nvtxPayloadEnum_v1 +{ + /** + * Name of the enum value. + */ + const char* name; + + /** + * Value of the enum entry. 
+ */ + uint64_t value; + + /** + * Indicates that this entry sets a specific set of bits, which can be used + * to define bitsets. + */ + int8_t isFlag; +} nvtxPayloadEnum_t; + +/** + * \brief NVTX payload enumeration type attributes. + * + * A pointer to this struct is passed to @ref nvtxPayloadEnumRegister. + */ +typedef struct nvtxPayloadEnumAttr_v1 +{ + /** + * Mask of valid fields in this struct. See `NVTX_PAYLOAD_ENUM_ATTR_*`. + */ + uint64_t fieldMask; + + /** + * Name of the enum. (Optional) + */ + const char* name; + + /** + * Entries of the enum. (Mandatory) + */ + const nvtxPayloadEnum_t* entries; + + /** + * Number of entries in the enum. (Mandatory) + */ + size_t numEntries; + + /** + * Size of enumeration type in bytes + */ + size_t sizeOfEnum; + + /** + * Static/custom schema ID must be + * >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and + * < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START + */ + uint64_t schemaId; + + /** + * Flexible extension for enumeration attributes. + * (Do not use. Reserved for future use.) + */ + void* extension; +} nvtxPayloadEnumAttr_t; + +typedef struct nvtxScopeAttr_v1 +{ + size_t structSize; + + /** + * Path delimited by '/' characters, relative to parentScope. Leading + * slashes are ignored. Nodes in the path may use name[key] syntax to + * indicate an array of sibling nodes, which may be combined with other + * non-array nodes or different arrays at the same scope. Node names should + * be UTF8 printable characters. '\' has to be used to escape '/', '[', and + * ']' characters in node names. An empty C string "" and `NULL` are valid + * inputs and treated equivalently. + */ + const char* path; + + uint64_t parentScope; + + /** + * The static scope ID must be unique within the domain, + * >= NVTX_SCOPE_ID_STATIC_START, and + * < NVTX_SCOPE_ID_DYNAMIC_START. 
+ */ + uint64_t scopeId; +} nvtxScopeAttr_t; + +#endif /* NVTX_PAYLOAD_TYPEDEFS_V1 */ + +#ifndef NVTX_PAYLOAD_API_FUNCTIONS_V1 +#define NVTX_PAYLOAD_API_FUNCTIONS_V1 + +/** + * \brief Register a payload schema. + * + * @param domain NVTX domain handle. + * @param attr NVTX payload schema attributes. + */ +NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister( + nvtxDomainHandle_t domain, + const nvtxPayloadSchemaAttr_t* attr); + +/** + * \brief Register an enumeration type with the payload extension. + * + * @param domain NVTX domain handle + * @param attr NVTX payload enumeration type attributes. + */ +NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadEnumRegister( + nvtxDomainHandle_t domain, + const nvtxPayloadEnumAttr_t* attr); + +/** + * \brief Register a scope. + * + * @param domain NVTX domain handle (0 for default domain) + * @param attr Scope attributes. + * + * @return an identifier for the scope. If the operation was not successful, + * `NVTX_SCOPE_NONE` is returned. + */ +NVTX_DECLSPEC uint64_t NVTX_API nvtxScopeRegister( + nvtxDomainHandle_t domain, + const nvtxScopeAttr_t* attr); + +/** + * \brief Marks an instantaneous event in the application with the attributes + * being passed via the extended payload. + * + * An NVTX handler can assume that the payload contains the event message. + * Otherwise, it might ignore the event. + * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + */ +NVTX_DECLSPEC void NVTX_API nvtxMarkPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief Begin a nested thread range with the attributes being passed via the + * payload. + * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + * + * @return The level of the range being started. 
If an error occurs a negative + * value is returned on the current thread. + */ +NVTX_DECLSPEC int NVTX_API nvtxRangePushPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief End a nested thread range with an additional custom payload. + * + * NVTX event attributes passed to this function (via the payloads) overwrite + * event attributes (message and color) that have been set in the push event. + * Other payload entries extend the data of the range. + * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + * + * @return The level of the range being ended. If an error occurs a negative + * value is returned on the current thread. + */ +NVTX_DECLSPEC int NVTX_API nvtxRangePopPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief Start a thread range with attributes passed via the extended payload. + * + * @param domain NVTX domain handle + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. + * + * @return The correlation ID of the started range; pass it to + * @ref nvtxRangeEndPayload to end the range. + */ +NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartPayload( + nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * \brief End a thread range and pass a custom payload. + * + * NVTX event attributes passed to this function (via the payloads) overwrite + * event attributes (message and color) that have been set in the start event. + * Other payload entries extend the data of the range. + * + * @param domain NVTX domain handle + * @param id The correlation ID returned from a NVTX range start call. + * @param payloadData pointer to an array of structured payloads. + * @param count number of payload BLOBs. 
+ */ +NVTX_DECLSPEC void NVTX_API nvtxRangeEndPayload( + nvtxDomainHandle_t domain, + nvtxRangeId_t id, + const nvtxPayloadData_t* payloadData, + size_t count); + +/** + * @brief Checks if an NVTX domain is enabled. + * + * @param domain NVTX domain handle + * @return 0 if the domain is not enabled. + */ +NVTX_DECLSPEC uint8_t NVTX_API nvtxDomainIsEnabled( + nvtxDomainHandle_t domain); + +/** + * \brief Report a push-pop range in a single call. + * \category NsysInternal + * + * This function is called at range pop. Thus, the NVTX handler will immediately + * take a timestamp (if timing is desired). The timestamp of the push operation + * is passed as argument and can be retrieved via `nvtxTimestampGet()`. + * + * The NVTX handler can assume that no other push operation happened in the same + * domain in between the push and the pop time of the reported range. + * + * @param domain The domain of scoping. + * @param eventAttrib The event attribute structure defining the range's + * attribute types and attribute values. + * @param pushTime The timestamp of the push operation (use `nvtxTimestampGet()`). + */ +NVTX_DECLSPEC void NVTX_API nvtxRangePushPop(nvtxDomainHandle_t domain, + const nvtxEventAttributes_t* eventAttrib, uint64_t pushTime); + +/** + * \brief Get a timestamp from the attached NVTX handler/tool. + * + * The timestamp is intended to be passed as `pushTime` to @ref nvtxRangePushPop. + * The time source is assumed to be TSC. + */ +NVTX_DECLSPEC int64_t NVTX_API nvtxTimestampGet(void); + +#endif /* NVTX_PAYLOAD_API_FUNCTIONS_V1 */ + +#ifndef NVTX_PAYLOAD_CALLBACK_ID_V1 +#define NVTX_PAYLOAD_CALLBACK_ID_V1 +/** + * \brief Callback Ids of API functions in the payload extension. + * + * The NVTX handler can use these values to register a handler function. 
When + * InitializeInjectionNvtxExtension(nvtxExtModuleInfo_t* moduleInfo) is + * executed, a handler routine 'handlenvtxPayloadRegisterSchema' can be + * registered as follows: + * \code{.c} + * moduleInfo->segments->functionSlots[NVTX3EXT_CBID_nvtxPayloadSchemaRegister] = + * (intptr_t)YourPayloadRegisterSchemaHandlerFn; + * \endcode + */ +#define NVTX3EXT_CBID_nvtxPayloadSchemaRegister 0 +#define NVTX3EXT_CBID_nvtxPayloadEnumRegister 1 +#define NVTX3EXT_CBID_nvtxMarkPayload 2 +#define NVTX3EXT_CBID_nvtxRangePushPayload 3 +#define NVTX3EXT_CBID_nvtxRangePopPayload 4 +#define NVTX3EXT_CBID_nvtxRangeStartPayload 5 +#define NVTX3EXT_CBID_nvtxRangeEndPayload 6 +#define NVTX3EXT_CBID_nvtxDomainIsEnabled 7 +#define NVTX3EXT_CBID_nvtxTimestampGet 8 +#define NVTX3EXT_CBID_nvtxScopeRegister 12 + +/* NSys internal use only. */ +#define NVTX3EXT_CBID_nvtxRangePushPop 62 +#endif /* NVTX_PAYLOAD_CALLBACK_ID_V1 */ + +/*** Helper utilities ***/ + +/** \brief Helper macro for safe double-cast of pointer to uint64_t value (pointer -> uintptr_t -> uint64_t; the argument is parenthesized so any pointer expression can be passed). */ +#ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE +# ifdef __cplusplus +# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \ + static_cast<uint64_t>(reinterpret_cast<uintptr_t>(p)) +# else +#define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)(p)) +# endif +#endif + +#ifndef NVTX_PAYLOAD_EVTATTR_SET_DATA +/** + * \brief Helper macro to attach a single payload to an NVTX event attribute. + * + * @param evtAttr NVTX event attribute (variable name) + * @param pldata_addr Address of `nvtxPayloadData_t` variable. + * @param schema_id NVTX binary payload schema ID. + * @param pl_addr Address of the (actual) payload. + * @param sz size of the (actual) payload.
+ */ +#define NVTX_PAYLOAD_EVTATTR_SET_DATA(evtAttr, pldata_addr, schema_id, pl_addr, sz) \ + (pldata_addr)->schemaId = schema_id; \ + (pldata_addr)->size = sz; \ + (pldata_addr)->payload = pl_addr; \ + (evtAttr).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(pldata_addr); \ + (evtAttr).payloadType = NVTX_PAYLOAD_TYPE_EXT; \ + (evtAttr).reserved0 = 1; +#endif /* NVTX_PAYLOAD_EVTATTR_SET_DATA */ + +#ifndef NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE +/** + * \brief Helper macro to attach multiple payloads to an NVTX event attribute. Expands to several statements, so do not use it as an unbraced `if`/`else` body. + * + * @param evtAttr NVTX event attribute (variable name, accessed with `.`) + * @param pldata Payload data array (of type `nvtxPayloadData_t`); must be a true array, not a pointer, because the entry count is computed with `sizeof` + */ +#define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(evtAttr, pldata) \ + (evtAttr).payloadType = NVTX_PAYLOAD_TYPE_EXT; \ + (evtAttr).reserved0 = sizeof(pldata)/sizeof(nvtxPayloadData_t); \ + (evtAttr).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(pldata); +#endif /* NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE */ + +#ifndef NVTX_PAYLOAD_EVTATTR_SET +/* + * Do not use this macro directly! It is a helper to attach a single payload to + * an NVTX event attribute. Here `evtAttr` is a pointer (accessed with `->`), and the macro declares a local `_NVTX_PAYLOAD_DATA_VAR` array in the enclosing scope. + * @warning The NVTX push, start or mark operation must not be in an outer scope. + */ +#define NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schema_id, pl_addr, sz) \ + nvtxPayloadData_t _NVTX_PAYLOAD_DATA_VAR[] = \ + {{schema_id, sz, pl_addr}}; \ + (evtAttr)->payload.ullValue = \ + NVTX_POINTER_AS_PAYLOAD_ULLVALUE(_NVTX_PAYLOAD_DATA_VAR); \ + (evtAttr)->payloadType = NVTX_PAYLOAD_TYPE_EXT; \ + (evtAttr)->reserved0 = 1; +#endif /* NVTX_PAYLOAD_EVTATTR_SET */ + +#ifndef nvtxPayloadRangePush +/** + * \brief Helper macro to push a range with extended payload. + * + * @param domain NVTX domain handle (0 for default domain) + * @param evtAttr pointer to NVTX event attribute. + * @param schemaId NVTX payload schema ID + * @param plAddr Pointer to the binary data (actual payload) + * @param size Size of the binary payload data in bytes.
+ */ +#define nvtxPayloadRangePush(domain, evtAttr, schemaId, plAddr, size) \ +do { \ + NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schemaId, plAddr, size) \ + nvtxDomainRangePushEx(domain, evtAttr); \ +} while (0) +#endif /* nvtxPayloadRangePush */ + +#ifndef nvtxPayloadMark +/** + * \brief Helper macro to set a marker with extended payload. + * + * @param domain NVTX domain handle (0 for default domain) + * @param evtAttr pointer to NVTX event attribute. + * @param schemaId NVTX payload schema ID + * @param plAddr Pointer to the binary data (actual payload) + * @param size Size of the binary payload data in bytes. + */ +#define nvtxPayloadMark(domain, evtAttr, schemaId, plAddr, size) \ +do { \ + NVTX_PAYLOAD_EVTATTR_SET(evtAttr, schemaId, plAddr, size) \ + nvtxDomainMarkEx(domain, evtAttr); \ +} while (0) +#endif /* nvtxPayloadMark */ + +#ifdef __GNUC__ +#pragma GCC visibility push(internal) +#endif + +/* Extension types are required for the implementation and the NVTX handler. */ +#define NVTX_EXT_TYPES_GUARD +#include "nvtxExtDetail/nvtxExtTypes.h" +#undef NVTX_EXT_TYPES_GUARD + +#ifndef NVTX_NO_IMPL +#define NVTX_EXT_IMPL_PAYLOAD_GUARD +#include "nvtxExtDetail/nvtxExtImplPayload_v1.h" +#undef NVTX_EXT_IMPL_PAYLOAD_GUARD +#endif /* NVTX_NO_IMPL */ + +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif + +#ifdef __cplusplus +} +#endif /* __cplusplus */ diff --git a/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImpl.h b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImpl.h new file mode 100644 index 0000000000..dd215a35c6 --- /dev/null +++ b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImpl.h @@ -0,0 +1,102 @@ +/* +* Copyright 2009-2020 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. 
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). +#endif + +#ifndef NVTX_EXT_IMPL_H +#define NVTX_EXT_IMPL_H +/* ---- Include required platform headers ---- */ + +#if defined(_WIN32) + +#include + +#else +#include + +#if defined(__ANDROID__) +#include +#endif + +#if defined(__linux__) || defined(__CYGWIN__) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#endif + +/* ---- Define macros used in this file ---- */ + +#ifdef NVTX_DEBUG_PRINT +#ifdef __ANDROID__ +#include +#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); +#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); +#else +#include +#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) +#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) +#endif +#else /* !defined(NVTX_DEBUG_PRINT) */ +#define NVTX_ERR(...) +#define NVTX_INFO(...) +#endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ +/* +#ifdef __GNUC__ +#pragma GCC visibility push(hidden) +#endif +*/ +#define NVTX_EXTENSION_FRESH 0 +#define NVTX_EXTENSION_DISABLED 1 +#define NVTX_EXTENSION_STARTING 2 +#define NVTX_EXTENSION_LOADED 3 + +/* Function slots are local to each extension now! 
*/ +typedef struct nvtxExtGlobals1_t +{ + NvtxExtInitializeInjectionFunc_t injectionFnPtr; +} nvtxExtGlobals1_t; + +NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) = +{ + (NvtxExtInitializeInjectionFunc_t)0 +}; + +#define NVTX_EXT_INIT_GUARD +#include "nvtxExtInit.h" +#undef NVTX_EXT_INIT_GUARD +/* +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif +*/ +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* NVTX_EXT_IMPL_H */ \ No newline at end of file diff --git a/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h new file mode 100644 index 0000000000..a97810ed6a --- /dev/null +++ b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h @@ -0,0 +1,180 @@ +/* +* Copyright 2021-2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). +#endif + +#define NVTX_EXT_IMPL_GUARD +#include "nvtxExtImpl.h" +#undef NVTX_EXT_IMPL_GUARD + +#ifndef NVTX_EXT_IMPL_PAYLOAD_V1 +#define NVTX_EXT_IMPL_PAYLOAD_V1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* Macros to create versioned symbols. 
*/ +#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ + NAME##_v##VERSION##_bpl##COMPATID +#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ + NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) +#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \ + NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_PAYLOAD_COMPATID) + +#ifdef NVTX_DISABLE /* NVTX compiled out: every API function becomes a stub that ignores its arguments */ + +#include "nvtxExtHelperMacros.h" + +#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \ +ret_val fn_name signature { \ + NVTX_EXT_HELPER_UNUSED_ARGS arg_names \ + return ((ret_val)(intptr_t)-1); \ +} +#define NVTX_EXT_PAYLOAD_IMPL_FN_V1_VOID(fn_name, signature, arg_names) void fn_name signature { NVTX_EXT_HELPER_UNUSED_ARGS arg_names } /* no-op stub; the void API functions below are generated with this macro in both configurations */ +#else /* NVTX_DISABLE */ + +#include "nvtxExtPayloadTypeInfo.h" + +/* + * Function slots for the payload extension. First entry is the module state, + * initialized to `0` (`NVTX_EXTENSION_FRESH`). + */ +#define NVTX_EXT_PAYLOAD_SLOT_COUNT 63 +NVTX_LINKONCE_DEFINE_GLOBAL intptr_t +NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX_EXT_PAYLOAD_SLOT_COUNT + 1] + = {0}; + +/* Avoid warnings about missing prototype.
*/ +NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(void); +NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() +{ + intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1; + nvtxExtModuleSegment_t segment = { + 0, /* unused (only one segment) */ + NVTX_EXT_PAYLOAD_SLOT_COUNT, + fnSlots + }; + + nvtxExtModuleInfo_t module = { + NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), + NVTX_EXT_PAYLOAD_MODULEID, NVTX_EXT_PAYLOAD_COMPATID, + 1, &segment, /* number of segments, segments */ + NULL, /* no export function needed */ + /* bake type sizes and alignment information into program binary */ + &(NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)) + }; + + NVTX_INFO( "%s\n", __FUNCTION__ ); + + NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, + NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)); +} + +#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \ +typedef ret_type (*fn_name##_impl_fntype)signature; \ +NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \ + intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED) { \ + if (slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } else { \ + NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \ + /* Re-read function slot after extension initialization. 
*/ \ + slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ + return (*(fn_name##_impl_fntype)slot) arg_names; \ + } \ + } \ + } \ + NVTX_EXT_FN_RETURN_INVALID(ret_type) \ +} + +#define NVTX_EXT_PAYLOAD_IMPL_FN_V1_VOID(fn_name, signature, arg_names) \ +typedef void (*fn_name##_impl_fntype)signature; \ +NVTX_DECLSPEC void NVTX_API fn_name signature { \ + intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED) { \ + if (slot != NVTX_EXTENSION_FRESH) { \ + (*(fn_name##_impl_fntype)slot) arg_names; \ + } else { \ + NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \ + /* Re-read function slot after extension initialization. */ \ + slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ + if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \ + (*(fn_name##_impl_fntype)slot) arg_names; \ + } \ + } \ + } \ +} + +#endif /*NVTX_DISABLE*/ + +/* Non-void functions. 
*/ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1); + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadSchemaRegister, + (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), + (domain, attr)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadEnumRegister, + (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), + (domain, attr)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePushPayload, + (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), + (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePopPayload, + (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), + (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(nvtxRangeId_t, nvtxRangeStartPayload, + (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count), + (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint8_t, nvtxDomainIsEnabled, (nvtxDomainHandle_t domain), (domain)) + +/* Experimental */ +NVTX_EXT_PAYLOAD_IMPL_FN_V1(int64_t, nvtxTimestampGet, (void), ()) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxScopeRegister, (nvtxDomainHandle_t domain, + const nvtxScopeAttr_t* attr), (domain, attr)) + +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: Non-void functions. */ + +/* void functions. */ +#define NVTX_EXT_FN_RETURN_INVALID(rtype) +#define return + +NVTX_EXT_PAYLOAD_IMPL_FN_V1_VOID(nvtxMarkPayload, (nvtxDomainHandle_t domain, + const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count)) + +NVTX_EXT_PAYLOAD_IMPL_FN_V1_VOID(nvtxRangeEndPayload, (nvtxDomainHandle_t domain, + nvtxRangeId_t id, const nvtxPayloadData_t* payloadData, size_t count), + (domain, id, payloadData, count)) + +#undef return +#undef NVTX_EXT_FN_RETURN_INVALID +/* END: void functions. 
*/ + +NVTX_EXT_PAYLOAD_IMPL_FN_V1_VOID(nvtxRangePushPop, (nvtxDomainHandle_t domain, + const nvtxEventAttributes_t* evtAttr, uint64_t pushTime), + (domain, evtAttr, pushTime)) + +/* Keep NVTX_EXT_PAYLOAD_IMPL_FN_V1 defined for a future version of this extension. */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* NVTX_EXT_IMPL_PAYLOAD_V1 */ + diff --git a/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtInit.h b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtInit.h new file mode 100644 index 0000000000..743e55b938 --- /dev/null +++ b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtInit.h @@ -0,0 +1,378 @@ +/* +* Copyright 2009-2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_INIT_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
+#endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* ---- Platform-independent helper definitions and functions ---- */ + +/* Prefer macros over inline functions to reduce symbol resolution at link time */ + +#if defined(_WIN32) +#define NVTX_PATHCHAR wchar_t +#define NVTX_STR(x) L##x +#define NVTX_GETENV _wgetenv +#define NVTX_BUFSIZE MAX_PATH +#define NVTX_DLLHANDLE HMODULE +#define NVTX_DLLOPEN(x) LoadLibraryW(x) +#define NVTX_DLLFUNC GetProcAddress +#define NVTX_DLLCLOSE FreeLibrary +#define NVTX_YIELD() SwitchToThread() +#define NVTX_MEMBAR() MemoryBarrier() +#define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value) +#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand) +#define NVTX_ATOMIC_WRITE_PTR(address, value) InterlockedExchangePointer((volatile PVOID*)address, (PVOID)value) +#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) old = (intptr_t)InterlockedCompareExchangePointer((volatile PVOID*)address, (PVOID)exchange, (PVOID)comparand) + + +#elif defined(__GNUC__) +#define NVTX_PATHCHAR char +#define NVTX_STR(x) x +#define NVTX_GETENV getenv +#define NVTX_BUFSIZE PATH_MAX +#define NVTX_DLLHANDLE void* +#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY) +#define NVTX_DLLFUNC dlsym +#define NVTX_DLLCLOSE dlclose +#define NVTX_YIELD() sched_yield() +#define NVTX_MEMBAR() __sync_synchronize() +/* Ensure full memory barrier for atomics, to match Windows functions. 
*/ +#define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value) +#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand) +#define NVTX_ATOMIC_WRITE_PTR(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value) +#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand) +#else +#error The library does not support your configuration! +#endif + +/* Define this to 1 for platforms that where pre-injected libraries can be discovered. */ +#if defined(_WIN32) +/* TODO */ +#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0 +#else +#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0 +#endif + +/* Define this to 1 for platforms that support environment variables. */ +/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */ +/* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */ +#define NVTX_SUPPORT_ENV_VARS 1 + +/* Define this to 1 for platforms that support dynamic/shared libraries */ +#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1 + +/* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked, + * which will override any dynamic injection. This is useful for platforms, where dynamic + * injection is not available. Since weak symbols, not explicitly marked extern, are + * guaranteed to be initialized to zero, if no definitions are found by the linker, the + * dynamic injection process proceeds normally, if pfnInitializeInjectionNvtx2 is 0. 
*/ +#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) +#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1 +/* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal + * symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension, which + * does not need to be named "InitializeInjectionNvtxExtension" as it is necessary in a dynamic + * injection library. */ +__attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr; +#else +#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0 +#endif + + + +/* This function tries to find or load an NVTX injection library and get the address of its + * `InitializeInjectionNvtxExtension` function. If such a function pointer is found, it is stored in + * `*out_init_fnptr` (when that pointer is non-null) and NVTX_SUCCESS is returned; this function does + * not call the entry point itself. + * If a dynamic library was loaded but lacks the entry point, the library is closed and an error code + * is returned. If no entry point is found at all, NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE is returned + * and `*out_init_fnptr` is left as 0. This is implemented as one + * function instead of several small functions to minimize the number of weak symbols the linker + * must resolve. The order of search is: + * 1) Pre-injected library exporting InitializeInjectionNvtxExtension + * 2) Loadable library exporting InitializeInjectionNvtxExtension + * - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64) + * - On Android, libNvtxInjection??.so within the package (??
is 32 or 64) + * 3) Statically-linked injection library defining InitializeInjectionNvtxExtension_fnptr + */ +NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)( + NvtxExtInitializeInjectionFunc_t* out_init_fnptr); +NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)( + NvtxExtInitializeInjectionFunc_t* out_init_fnptr) +{ + const char* const initFuncName = "InitializeInjectionNvtxExtension"; + NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0; + NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0; + + if (out_init_fnptr) + { + *out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0; + } + +#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY + /* Use POSIX global symbol chain to query for init function from any module. */ + init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName); +#endif + +#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY + /* Try discovering dynamic injection library to load */ + if (!init_fnptr) + { +#if NVTX_SUPPORT_ENV_VARS + /* If env var NVTX_INJECTION64_PATH is set, it should contain the path + to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */ + const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4) + ? NVTX_STR("NVTX_INJECTION32_PATH") + : NVTX_STR("NVTX_INJECTION64_PATH"); +#endif /* NVTX_SUPPORT_ENV_VARS */ + NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE]; + const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0; + + /* Refer to this variable explicitly in case all references to it are #if'ed out. */ + (void)injectionLibraryPathBuf; + +#if NVTX_SUPPORT_ENV_VARS + /* Disable the warning for getenv & _wgetenv -- this usage is safe because + these functions are not called again before using the returned value.
*/ +#if defined(_MSC_VER) +#pragma warning( push ) +#pragma warning( disable : 4996 ) +#endif + injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName); +#if defined(_MSC_VER) +#pragma warning( pop ) +#endif +#endif + +#if defined(__ANDROID__) + if (!injectionLibraryPath) + { + const char *bits = (sizeof(void*) == 4) ? "32" : "64"; + char cmdlineBuf[32]; + char pkgName[PATH_MAX]; + int count; + int pid; + FILE *fp; + size_t bytesRead; + size_t pos; + + pid = (int)getpid(); + count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid); + if (count <= 0 || count >= (int)sizeof(cmdlineBuf)) + { + NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid); + return NVTX_ERR_INIT_ACCESS_LIBRARY; + } + + fp = fopen(cmdlineBuf, "r"); + if (!fp) + { + NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf); + return NVTX_ERR_INIT_ACCESS_LIBRARY; + } + + bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp); + fclose(fp); + if (bytesRead == 0) + { + NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf); + return NVTX_ERR_INIT_ACCESS_LIBRARY; + } + + pkgName[bytesRead] = 0; + + /* String can contain colon as a process separator. In this case the + package name is before the colon. */ + pos = 0; + while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0') + { + ++pos; + } + pkgName[pos] = 0; + + count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits); + if (count <= 0 || count >= NVTX_BUFSIZE) + { + NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits); + return NVTX_ERR_INIT_ACCESS_LIBRARY; + } + + /* On Android, verify path is accessible due to aggressive file access restrictions. */ + /* For dlopen, if the filename contains a leading slash, then it is interpreted as a */ + /* relative or absolute pathname; otherwise it will follow the rules in ld.so. 
*/ + if (injectionLibraryPathBuf[0] == '/') + { +#if (__ANDROID_API__ < 21) + int access_err = access(injectionLibraryPathBuf, F_OK | R_OK); +#else + int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0); +#endif + if (access_err != 0) + { + NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf); + return NVTX_ERR_INIT_ACCESS_LIBRARY; + } + } + injectionLibraryPath = injectionLibraryPathBuf; + } +#endif + + /* At this point, `injectionLibraryPath` is specified if a dynamic + injection library was specified by a tool. */ + if (injectionLibraryPath) + { + /* Load the injection library */ + injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath); + if (!injectionLibraryHandle) + { + NVTX_ERR("Failed to load injection library\n"); + return NVTX_ERR_INIT_LOAD_LIBRARY; + } + else + { + /* Attempt to get the injection library's entry-point. */ + init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName); + if (!init_fnptr) + { + NVTX_DLLCLOSE(injectionLibraryHandle); + NVTX_ERR("Failed to get address of function %s from injection library\n", initFuncName); + return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT; + } + } + } + } +#endif + +#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY + if (!init_fnptr) + { + /* Check weakly-defined function pointer. A statically-linked injection can define + this as a normal symbol and it will take precedence over a dynamic injection. */ + if (InitializeInjectionNvtxExtension_fnptr) + { + init_fnptr = InitializeInjectionNvtxExtension_fnptr; + } + } +#endif + + if (out_init_fnptr) + { + *out_init_fnptr = init_fnptr; + } + + /* At this point, if `init_fnptr` is not set, no tool has specified an NVTX injection library. + Non-success result is returned, so that all NVTX API functions will be set to no-ops. 
*/ + if (!init_fnptr) + { + return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE; + } + + return NVTX_SUCCESS; +} + +/* One-time initialization of an extension module: obtains the injection init function (cached in nvtxExtGlobals1 or freshly looked up), calls it with `moduleInfo`, then publishes NVTX_EXTENSION_LOADED through `moduleState`. The forward declaration avoids warnings about missing prototypes. */ +NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( + nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState); +NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( + nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState) +{ + intptr_t old; + + NVTX_INFO( "%s\n", __FUNCTION__ ); + + if (*moduleState == NVTX_EXTENSION_LOADED) + { + NVTX_INFO("Module loaded\n"); + return; + } + + NVTX_ATOMIC_CAS_PTR( + old, + moduleState, + NVTX_EXTENSION_STARTING, + NVTX_EXTENSION_FRESH); + if (old == NVTX_EXTENSION_FRESH) + { + NvtxExtInitializeInjectionFunc_t init_fnptr = + NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr; + int entryPointStatus = 0; + int forceAllToNoops = 0; + size_t s; + + /* Load and initialize injection library, which will assign the function pointers. Skipped when an init function is already cached. */ + if (init_fnptr == 0) + { + int result = 0; + + /* Try to load vanilla NVTX first. */ + nvtxInitialize(0); + + result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr); + /* At this point `init_fnptr` will be either 0 or a real function. */ + + if (result == NVTX_SUCCESS) + { + NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr = init_fnptr; + } + else + { + NVTX_ERR("Failed to load injection library\n"); + } + } + + if (init_fnptr != 0) + { + /* Invoke injection library's initialization function. If it returns + 0 (failure), every function slot is disabled below. NOTE(review): a dynamically loaded injection library is not unloaded here. */ + entryPointStatus = init_fnptr(moduleInfo); + if (entryPointStatus == 0) + { + NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n"); + } + } + + /* Clean up any functions that are still uninitialized so that they are + skipped. Disable every slot if no init function was found or it reported failure.
*/ + forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0); + for (s = 0; s < moduleInfo->segmentsCount; ++s) + { + nvtxExtModuleSegment_t* segment = moduleInfo->segments + s; + size_t i; + for (i = 0; i < segment->slotCount; ++i) + { + if (forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)) + { + segment->functionSlots[i] = NVTX_EXTENSION_DISABLED; + } + } + } + + NVTX_MEMBAR(); + + /* Signal that initialization has finished and the assigned function + pointers will be used. */ + NVTX_ATOMIC_WRITE_PTR(moduleState, NVTX_EXTENSION_LOADED); + } + else /* Spin-wait until initialization has finished. */ + { + NVTX_MEMBAR(); + while (*moduleState != NVTX_EXTENSION_LOADED) + { + NVTX_YIELD(); + NVTX_MEMBAR(); + } + } +} + +#ifdef __cplusplus +} +#endif /* __cplusplus */ diff --git a/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h new file mode 100644 index 0000000000..6a30e6633a --- /dev/null +++ b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h @@ -0,0 +1,151 @@ +/* +* Copyright 2021-2023 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). +#endif + +typedef void* nvtx_payload_pointer_type; + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) +#include +#include +#endif + +/* `alignof` is available as of C11 or C++11. 
*/ +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L) + +#define nvtx_alignof(type) alignof(type) +#define nvtx_alignof2(type,tname) alignof(type) + +#else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */ + +/* Create helper structs to determine type alignment: the offset of member `d` after a leading `char` equals the alignment of its type. */ +#define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type +#define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname + +MKTYPEDEF(char); +MKTYPEDEF2(unsigned char, uchar); +MKTYPEDEF(short); +MKTYPEDEF2(unsigned short, ushort); +MKTYPEDEF(int); +MKTYPEDEF2(unsigned int, uint); +MKTYPEDEF(long); +MKTYPEDEF2(unsigned long, ulong); +MKTYPEDEF2(long long, longlong); +MKTYPEDEF2(unsigned long long, ulonglong); + +MKTYPEDEF(int8_t); +MKTYPEDEF(uint8_t); +MKTYPEDEF(int16_t); +MKTYPEDEF(uint16_t); +MKTYPEDEF(int32_t); +MKTYPEDEF(uint32_t); +MKTYPEDEF(int64_t); +MKTYPEDEF(uint64_t); + +MKTYPEDEF(float); +MKTYPEDEF(double); +MKTYPEDEF2(long double, longdouble); + +MKTYPEDEF(size_t); +MKTYPEDEF(nvtx_payload_pointer_type); + +MKTYPEDEF(wchar_t); + +/* `char8_t` is available as of C++20 or C23 */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L) + MKTYPEDEF(char8_t); +#endif + +/* `char16_t` and `char32_t` are available as of C++11 or C11 */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L) + MKTYPEDEF(char16_t); + MKTYPEDEF(char32_t); +#endif + +/* C requires to include stddef.h to use `offsetof` */ +#ifndef __cplusplus +#include <stddef.h> +#endif + +#define nvtx_alignof(tname) offsetof(_nvtx_##tname, d) +#define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d) + +#endif /* __STDC_VERSION__ >= 201112L */ + +#undef MKTYPEDEF +#undef MKTYPEDEF2 + +/* + * Helper array to get the alignment for each predefined C/C++ language type.
+ * The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`. + * + * In C++, `const` variables use internal linkage by default, but we need it to + * be public (extern) since weak declarations must be public. + */ +NVTX_LINKONCE_DEFINE_GLOBAL +#ifdef __cplusplus +extern +#endif +const nvtxPayloadEntryTypeInfo_t +NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] = +{ + /* The first entry contains this array's length and the size of each entry in this array. */ + {NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)}, + + /*** C integer types ***/ + /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR */ {sizeof(char), nvtx_alignof(char)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_UCHAR */ {sizeof(unsigned char), nvtx_alignof2(unsigned char, uchar)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_SHORT */ {sizeof(short), nvtx_alignof(short)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_USHORT */ {sizeof(unsigned short), nvtx_alignof2(unsigned short, ushort)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_INT */ {sizeof(int), nvtx_alignof(int)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_UINT */ {sizeof(unsigned int), nvtx_alignof2(unsigned int, uint)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_LONG */ {sizeof(long), nvtx_alignof(long)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_ULONG */ {sizeof(unsigned long), nvtx_alignof2(unsigned long, ulong)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG */ {sizeof(long long), nvtx_alignof2(long long, longlong)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG */ {sizeof(unsigned long long), nvtx_alignof2(unsigned long long,ulonglong)}, + + /*** Integer types with explicit size ***/ + /* NVTX_PAYLOAD_ENTRY_TYPE_INT8 */ {sizeof(int8_t), nvtx_alignof(int8_t)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_UINT8 */ {sizeof(uint8_t), nvtx_alignof(uint8_t)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_INT16 */ {sizeof(int16_t), nvtx_alignof(int16_t)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_UINT16 */ {sizeof(uint16_t), nvtx_alignof(uint16_t)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_INT32 */ {sizeof(int32_t), nvtx_alignof(int32_t)}, + 
/* NVTX_PAYLOAD_ENTRY_TYPE_UINT32 */ {sizeof(uint32_t), nvtx_alignof(uint32_t)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_INT64 */ {sizeof(int64_t), nvtx_alignof(int64_t)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_UINT64 */ {sizeof(uint64_t), nvtx_alignof(uint64_t)}, + + /*** C floating point types ***/ + /* NVTX_PAYLOAD_ENTRY_TYPE_FLOAT */ {sizeof(float), nvtx_alignof(float)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE */ {sizeof(double), nvtx_alignof(double)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)}, + + /* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */ {sizeof(size_t), nvtx_alignof(size_t)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(nvtx_payload_pointer_type), nvtx_alignof(nvtx_payload_pointer_type)}, + + /*** Special character types ***/ + /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)}, + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L) + /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)}, +#else + /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0}, +#endif + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L) + /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)}, + /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)} +#else + /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {0, 0}, + /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {0, 0} +#endif +}; + +#undef nvtx_alignof +#undef nvtx_alignof2 \ No newline at end of file diff --git a/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtTypes.h b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtTypes.h new file mode 100644 index 0000000000..bcad095a0c --- /dev/null +++ b/src/main/cpp/profiler/nvtx3/nvtxExtDetail/nvtxExtTypes.h @@ -0,0 +1,44 @@ +/* +* Copyright 2021 NVIDIA Corporation. All rights reserved. +* +* Licensed under the Apache License v2.0 with LLVM Exceptions. 
+* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +/* This header defines types which are used by the internal implementation +* of NVTX and callback subscribers. API clients do not use these types, +* so they are defined here instead of in nvToolsExt.h to clarify they are +* not part of the NVTX client API. */ + +#ifndef NVTXEXTTYPES_H +#define NVTXEXTTYPES_H + +#ifndef NVTX_EXT_TYPES_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h. +#endif + +typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId); + +typedef struct nvtxExtModuleSegment_t +{ + size_t segmentId; + size_t slotCount; + intptr_t* functionSlots; +} nvtxExtModuleSegment_t; + +typedef struct nvtxExtModuleInfo_t +{ + uint16_t nvtxVer; + uint16_t structSize; + uint16_t moduleId; + uint16_t compatId; + size_t segmentsCount; + nvtxExtModuleSegment_t* segments; + NvtxExtGetExportFunction_t getExportFunction; + const void* extInfo; +} nvtxExtModuleInfo_t; + +typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); + +#endif /* NVTXEXTTYPES_H */ \ No newline at end of file diff --git a/src/main/cpp/profiler/nvtxw3.cpp b/src/main/cpp/profiler/nvtxw3.cpp new file mode 100644 index 0000000000..b18aba73da --- /dev/null +++ b/src/main/cpp/profiler/nvtxw3.cpp @@ -0,0 +1,874 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * Licensed under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + */ + +#include +#include +#include +#include + +#if defined(_WIN32) +#include +#else +#include +#include +#if defined (_QNX_SOURCE) +#include +#include +#else +#include +#endif +#include +#include +#endif + +#if defined(__APPLE__) +#include +#endif + +#include "nvtxw3.h" + +/*-------------------------------------------------------------*/ +/* Path string helpers -- implement here to avoid dependencies */ + +#if defined(_WIN32) +static const char pathSep = '\\'; +#if defined(NVTXW3_TEST_PATH_UTILITIES) +static const char pathDelimiter = ';'; +#endif +static const size_t initialPathBufSize = MAX_PATH; /* Grows if not big enough */ +#define NVTXW3_DLLHANDLE HMODULE +#define NVTXW3_DLLOPEN(x) LoadLibraryA(x) +#define NVTXW3_DLLFUNC GetProcAddress +#define NVTXW3_DLLCLOSE FreeLibrary +#else +static const char pathSep = '/'; +#if defined(NVTXW3_TEST_PATH_UTILITIES) +static const char pathDelimiter = ':'; +#endif +static const size_t initialPathBufSize = 260; /* Grows if not big enough */ +#define NVTXW3_DLLHANDLE void* +#define NVTXW3_DLLOPEN(x) dlopen(x, RTLD_LAZY) +#define NVTXW3_DLLFUNC dlsym +#define NVTXW3_DLLCLOSE dlclose +#endif + +#if defined(NVTXW3_TEST_PATH_UTILITIES) +/* If native path separator is not forward slash (e.g. backslash on Windows), +* do in-place conversion of forward slashes to native path separator. */ +static void ForwardSlashesToNative(char* path) +{ +#if _WIN32 + char* cur; + if (!path) return; + for (cur = path; *cur; ++cur) + { + if (*cur == '/') *cur = pathSep; + } +#else + (void)path; +#endif +} +#endif + +/* Take pointers to string buffer begin/end. 
End must equal begin + strlen(begin), +* or NULL, in which case it will be set to begin + strlen(begin). +* Remove trailing slashes in-place by overwriting first trailing slash with null. */ +static void StripTrailingSlashes(char* path) +{ + char* newPathEnd; + char* pathEnd = path + strlen(path); + + newPathEnd = pathEnd; + while (newPathEnd != path) + { + char* cur = newPathEnd - 1; + if (*cur != pathSep) break; + newPathEnd = cur; + } + if (newPathEnd != pathEnd) + { + *newPathEnd = '\0'; + } +} + +/* Take pointers to string buffer begin/end. End must equal begin + strlen(begin), +* or NULL, in which case it will be set to begin + strlen(begin). +* Remove leading slashes in-place by memmove-ing from first character after leading +* slashes to beginning of buffer, including null terminator. */ +#if defined(NVTXW3_TEST_PATH_UTILITIES) +static char* AfterLeadingSlashes(char* cur) +{ + for (; *cur && *cur == pathSep; ++cur); + return cur; +} +#endif +static const char* AfterLeadingSlashesConst(const char* cur) +{ + for (; *cur && *cur == pathSep; ++cur); + return cur; +} + +#if defined(NVTXW3_TEST_PATH_UTILITIES) +/* Take pointers to string buffer begin/end. End must equal begin + strlen(begin), +* or NULL, in which case it will be set to begin + strlen(begin). +* Remove leading slashes in-place by memmove-ing from first character after leading +* slashes to beginning of buffer, including null terminator. */ +static void StripLeadingSlashes(char* path) +{ + char* afterSlashes = AfterLeadingSlashes(path); + if (afterSlashes != path) + { + size_t sizeAfterSlashesWithNull = strlen(afterSlashes) + 1; + memmove(path, afterSlashes, sizeAfterSlashesWithNull); + } +} +#endif + +/* Take pointers to string buffer begin/end. End must equal begin + strlen(begin), +* or NULL, in which case it will be set to begin + strlen(begin). +* Returns pointer to heap-allocated copy of input, must be freed with free(). 
*/ +static char* AssignHeapString(char* lhs, const char* rhs) +{ + size_t lenWithNull; + + if (!rhs) return NULL; + + lenWithNull = strlen(rhs) + 1; + lhs = (char*)realloc(lhs, lenWithNull); + memcpy(lhs, rhs, lenWithNull); + return lhs; +} + +static char* AssignHeapStringFromRange(char* lhs, const char* rhsBegin, const char* rhsEnd) +{ + size_t lenWithoutNull; + + if (!rhsBegin || !rhsEnd) return NULL; + + lenWithoutNull = rhsEnd - rhsBegin; + lhs = (char*)realloc(lhs, lenWithoutNull + 1); + memcpy(lhs, rhsBegin, lenWithoutNull); + lhs[lenWithoutNull] = '\0'; + return lhs; +} + +/* Take pointers to string buffer begin/end. End must equal begin + strlen(begin), +* or NULL, in which case it will be set to begin + strlen(begin). +* Returns pointer to heap-allocated copy of input, must be freed with free(). */ +static char* MakeHeapString(const char* str) +{ + return AssignHeapString(NULL, str); +} + +static char* MakeHeapStringFromRange(const char* strBegin, const char* strEnd) +{ + return AssignHeapStringFromRange(NULL, strBegin, strEnd); +} + +#if defined(NVTXW3_TEST_PATH_UTILITIES) +static char* MakeHeapStringWithNativeSlashes(const char* str) +{ + char* buf = AssignHeapString(NULL, str); + ForwardSlashesToNative(buf); + return buf; +} + +/* Take pointer to a HeapString (lhs) and any C string (rhs), append rhs to lhs, +* reallocating the heap memory for lhs if necessary. Returns pointer to result +* HeapString, which may or may not be the same pointer passed in as lhs. +* HeapString must be freed with free(). */ +static char* AppendToHeapString(char* lhs, const char* rhs) +{ + size_t lenLhs, lenRhs; + lenLhs = strlen(lhs); + lenRhs = strlen(rhs); + if (lenRhs == 0) return lhs; + lhs = (char*)realloc(lhs, lenLhs + lenRhs + 1); + memcpy(lhs + lenLhs, rhs, lenRhs + 1); + return lhs; +} +#endif + +/* Take pointer to a HeapString (lhs) and any C string (rhs), append rhs to lhs, +* with a path separator between them, reallocating the heap memory for lhs if +* necessary. 
If rhs is null or empty, then the result is lhs unmodified. If +* lhs is null or empty and rhs is not, then the result is a path separator +* followed by rhs. Returns pointer to result HeapString, which may or may not +* be the same pointer passed in as lhs. HeapString must be freed with free(). */ +static char* AppendToHeapStringWithSep(char* lhs, const char* rhs) +{ + size_t lenLhs, lenRhs; + lenLhs = strlen(lhs); + lenRhs = strlen(rhs); + if (lenRhs == 0) return lhs; + lhs = (char*)realloc(lhs, lenLhs + lenRhs + 2); + lhs[lenLhs] = pathSep; + memcpy(lhs + lenLhs + 1, rhs, lenRhs + 1); + return lhs; +} + +/* dir is a HeapString. If dir is empty or just slashes, result will be a +* path relative to the root, i.e. beginning with a path separator. +* relativePath must be a valid relative path (not empty, not just slashes). +* Returns pointer to result HeapString, which may or may not be the same +* pointer passed in as lhs. HeapString must be freed with free(). */ +static char* AppendToPathHeapString(char* dir, const char* relativePath) +{ + const char* relPathAfterLeadingSlashes; + relPathAfterLeadingSlashes = AfterLeadingSlashesConst(relativePath); + StripTrailingSlashes(dir); + return AppendToHeapStringWithSep(dir, relPathAfterLeadingSlashes); +} + +static char* LoadFileIntoHeapString(const char* filename) +{ + FILE* f; + char* buf; + int err; + long pos; + size_t size; + size_t bytesRead; + + f = fopen(filename, "rb"); + if (!f) return NULL; + err = fseek(f, 0, SEEK_END); + if (err) { fclose(f); return NULL; } + pos = ftell(f); + if (pos < 0) { fclose(f); return NULL; } + rewind(f); + size = (size_t)pos; + + buf = (char*)malloc(size + 1); + if (!buf) { fclose(f); return NULL; } + bytesRead = fread(buf, 1, size, f); + if (bytesRead < size) { fclose(f); free(buf); return NULL; } + + buf[size] = '\0'; + fclose(f); + return buf; +} + +#if defined(NVTXW3_TEST_PATH_UTILITIES) +static int HasSlashes(const char* cur) +{ + for (; *cur; ++cur) + { + if (*cur == pathSep) 
return 1; + } + return 0; +} + +static int HasTrailingSlash(const char* str) +{ + size_t len = strlen(str); + if (len == 0) return 0; + return str[len-1] == pathSep; +} +#endif + +static char* GetCurrentWorkingDir() +{ +#if defined(_WIN32) + DWORD size; + char* buf; + + // Returns size including space for null terminator + size = GetCurrentDirectoryA(0, NULL); + buf = (char*)malloc(size); + GetCurrentDirectoryA(size, buf); + return buf; +#else + size_t size = initialPathBufSize; + char* buf; + + buf = (char*)malloc(size); + while (!getcwd(buf, size)) + { + size *= 2; + buf = (char*)realloc(buf, size); + } + buf = (char*)realloc(buf, strlen(buf) + 1); + return buf; +#endif +} + +#if defined(NVTXW3_TEST_PATH_UTILITIES) +/* Take pointer to string buffer of possibly-relative path, and returns +* equivalent absolute path. Input path must not be empty. +* Returns pointer to heap-allocated string, must be freed with free(). */ +static char* AbsolutePath(const char* path) +{ +#if defined(_WIN32) + size_t size; + char* buf; + + if (!path) return NULL; + + // Returns size including space for null terminator + size = (size_t)GetFullPathNameA(path, 0, NULL, NULL); + buf = (char*)malloc(size); + GetFullPathNameA(path, size, buf, NULL); + return buf; +#else + if (!path) return NULL; + + return path[0] == pathSep + ? MakeHeapString(path) // Absolute already + : AppendToPathHeapString(GetCurrentWorkingDir(), path); +#endif +} +#endif + +/* Take pointer to heap string of path, and modifies it in-place to be its +* parent directory, i.e. the directory containing the input file/directory. +* String is shortened, but not reallocated, permitting possibly faster +* appending of different path later. Returns the pointer passed in without +* modifying it for convenient chaining of path functions. If input path is +* NULL, NULL is returned. If input is an empty string, or root directory, +* the heap string will be set to an empty string to indicate there is no +* parent directory. 
Returned pointer to heap-allocated string must be +* freed with free(). */ +static char* ToParentDir(char* path) +{ + char* cur; + + if (!path) return NULL; + + StripTrailingSlashes(path); + + for (cur = path + strlen(path); cur >= path; --cur) + { + if (*cur == pathSep) + { + /* Found the last slash */ + if (cur == path) + { + /* Special case -- last slash is first character + * in buffer. Trailing slashes were trimmed first, + * so this can only occur when ParentDir should + * return the root directory. This is the only + * case where we want to keep the slash we found, + * so write the null terminator after the slash. */ + *(cur + 1) = '\0'; + } + else + { + /* Change slash to null, terminating the string + * before the last slash */ + *cur = '\0'; + } + return path; + }; + } + + /* No slashes found, so there's no parent directory. Assign empty + * string by nulling first character, which is safe because all heap + * strings must be at least one byte long. */ + path[0] = '\0'; + return path; +} + +#if defined(NVTXW3_TEST_PATH_UTILITIES) +/* Take pointer to string buffer of path, and returns the parent directory, +* i.e. the directory containing the input file/directory. If input path is +* NULL, empty string, or root directory, NULL is returned to indicate there +* is no parent directory, so return value must be NULL-checked. +* Returns pointer to heap-allocated string, must be freed with free(). 
*/ +static char* ParentDir(const char* path) +{ + char* buf; + + if (!path) return NULL; + + buf = ToParentDir(MakeHeapString(path)); + + if (strlen(buf) == 0) + { + /* No slashes found, so there's no parent directory */ + free(buf); + return NULL; + } + else + { + return buf; + } +} + +static int PathExists(const char* path) +{ +#if defined(_WIN32) + DWORD result = GetFileAttributesA(path); + return result != INVALID_FILE_ATTRIBUTES; +#else + int result = access(path, F_OK); + return result != -1; +#endif +} +#endif + +/* Return a heap string containing the full path of the current process's +* executable file. Buffer allocated may be a little larger than the path +* string it contains, and is not realloc'ed to fit since typical usage of +* this function involves getting the parent directory and appending to it. +* Returned pointer to heap-allocated string must be freed with free(). */ +static char* GetCurrentProcessPath() +{ + char* buf; +#if defined(_WIN32) + { + DWORD size = initialPathBufSize; + DWORD newSize; + buf = NULL; + while (1) + { + buf = (char*)realloc(buf, size); + newSize = GetModuleFileNameA(NULL, buf, size); + if (newSize < size) break; + size *= 2; + } + } +#elif defined(__APPLE__) + { + size_t size = PROC_PIDPATHINFO_MAXSIZE; + pid_t pid; + buf = (char*)malloc(size); + pid = getpid(); + size = proc_pidpath(pid, buf, size); + if (size == 0) + { + buf[0] = '\0'; + } + } +#elif defined(__QNX__) + { + size_t size = fpathconf(0, _PC_MAX_INPUT); + if (size <= 0) + { + size = 4096; + } + ++size; + buf = (char*)malloc(size); + _cmdname(buf); + } +#else + { + size_t size = initialPathBufSize; + ssize_t bytesReadSigned; + size_t bytesRead; + const char* linkName = "/proc/self/exe"; + buf = NULL; + while (1) + { + buf = (char*)realloc(buf, size); + bytesReadSigned = readlink(linkName, buf, size); + if (bytesReadSigned < 0) { free(buf); return NULL; } + bytesRead = (size_t)bytesReadSigned; + if (bytesRead < size) break; + size *= 2; + } + buf[bytesRead] = 
'\0'; + } +#endif + return buf; +} + +static char* GetCurrentProcessDir() +{ + return ToParentDir(GetCurrentProcessPath()); +} + +static int KVPConsumerForSimplify( + void* state, + const char* readKeyBegin, + const char* readKeyEnd, + const char* readValBegin, + const char* readValEnd) +{ + char* curWrite = *(char**)state; + size_t size; + /* Safe to cast away const here, since we are pointing at a non-const heap string */ + char* keyBegin = (char*)readKeyBegin; + char* keyEnd = (char*)readKeyEnd; + char* valBegin = (char*)readValBegin; + char* valEnd = (char*)readValEnd; + + /* Rebuild the simplified config line at the write pointer, using memmove since the + * ranges may overlap or even be the exact same range. */ + size = keyEnd - keyBegin; + memmove(curWrite, keyBegin, size); + curWrite += size; + + *curWrite = '='; + ++curWrite; + + size = valEnd - valBegin; + memmove(curWrite, valBegin, size); + curWrite += size; + + *curWrite = '\n'; + ++curWrite; + + *(char**)state = curWrite; + + return 0; +} + +static char* SimplifyConfigHeapString(char* config) +{ + char* curWrite = config; + + nvtxwConsumeConfigString(config, KVPConsumerForSimplify, &curWrite); + + *curWrite = '\0'; + return (char*)realloc(config, strlen(config) + 1); +} + +typedef struct GetInitModeState_t +{ + int modeFound; + int modeStringFound; + int mode; + char* modeString; +} GetInitModeState_t; + +static int KVPConsumerForGetInitMode( + void* statePtr, + const char* keyBegin, + const char* keyEnd, + const char* valBegin, + const char* valEnd) +{ + GetInitModeState_t* state = (GetInitModeState_t*)statePtr; + const char* const keyMode = "InitMode"; + const char* const keyModeString = "InitModeString"; + const size_t keyModeLen = strlen(keyMode); + const size_t keyModeStringLen = strlen(keyModeString); + size_t keyLen; + + keyLen = keyEnd - keyBegin; + + if (!state->modeFound + && keyLen == keyModeLen + && strncmp(keyBegin, keyMode, keyLen) == 0) + { + int mode; + char* val; + val = 
MakeHeapStringFromRange(valBegin, valEnd); + mode = atoi(val); + free(val); + state->mode = mode; + state->modeFound = 1; + } + + if (!state->modeStringFound + && keyLen == keyModeStringLen + && strncmp(keyBegin, keyModeString, keyLen) == 0) + { + char* val; + val = MakeHeapStringFromRange(valBegin, valEnd); + state->modeString = val; + state->modeStringFound = 1; + } + + return state->modeFound && + (state->mode == NVTXW3_INIT_MODE_SEARCH_DEFAULT || state->modeStringFound); +} + +/* Returns zero for success, and writes out params mode and modeString (the latter +* is a HeapString). If mode is not detected, or if the mode requires a modeString +* and modeString is not detected, return non-zero error code. */ +static int GetInitModeFromConfig(const char* config, int* mode, char** modeString) +{ + GetInitModeState_t state = {0}; + + if (!mode || !modeString) return 1; + *mode = 0; + *modeString = NULL; + + nvtxwConsumeConfigString(config, KVPConsumerForGetInitMode, &state); + + /* Always an error if mode not found */ + if (!state.modeFound) + { + free(state.modeString); + return 1; + } + + /* Except in default mode, it's an error if modeString not found */ + if (state.mode != NVTXW3_INIT_MODE_SEARCH_DEFAULT && !state.modeStringFound) + { + return 2; + } + + *mode = state.mode; + *modeString = state.modeString; + return 0; +} + +/*-------------------------------------------------------------*/ +/* Backend loader helpers */ + +static nvtxwResultCode_t InitLibraryFilename( + const char* filename, /* required */ + const char* configString, /* optional */ + nvtxwGetInterface_t* getInterfaceFunc, /* already null-checked */ + void** moduleHandle) /* optional */ +{ + /* modeString is the filename of the library to load */ + NVTXW3_DLLHANDLE hModule; + nvtxwLoadImplementation_t pfnLoadImplementation; + nvtxwGetInterface_t tempGetInterfaceFunc = NULL; + nvtxwResultCode_t result; + char* configSimple = NULL; + + *getInterfaceFunc = NULL; + if (moduleHandle) *moduleHandle = 
NULL; + + if (!filename) + { + return NVTXW3_RESULT_INVALID_ARGUMENT; + } + + hModule = NVTXW3_DLLOPEN(filename); + if (!hModule) + { + return NVTXW3_RESULT_LIBRARY_NOT_FOUND; + } + + pfnLoadImplementation = (nvtxwLoadImplementation_t)NVTXW3_DLLFUNC(hModule, "nvtxwLoadImplementation"); + if (!pfnLoadImplementation) + { + NVTXW3_DLLCLOSE(hModule); + return NVTXW3_RESULT_LOADER_SYMBOL_MISSING; + } + + if (configString) + { + configSimple = SimplifyConfigHeapString(MakeHeapString(configString)); + } + + result = pfnLoadImplementation(configSimple, &tempGetInterfaceFunc); + free(configSimple); + if (result != NVTXW3_RESULT_SUCCESS || !tempGetInterfaceFunc) + { + NVTXW3_DLLCLOSE(hModule); + return result; + } + + /* Success - now write to output params */ + *getInterfaceFunc = tempGetInterfaceFunc; + if (moduleHandle) + { + void* mod = (void*)hModule; + *moduleHandle = mod; + } + + return NVTXW3_RESULT_SUCCESS; +} + +static nvtxwResultCode_t InitSearchDefault( + const char* configString, /* optional */ + nvtxwGetInterface_t* getInterfaceFunc, /* already null-checked */ + void** moduleHandle) /* optional */ +{ + nvtxwResultCode_t result; + char* filename; + + /* 1. Directory of current process's executable */ + filename = AppendToPathHeapString(GetCurrentProcessDir(), NVTXW3_LIB_FILENAME_DEFAULT); + result = InitLibraryFilename( + filename, configString, getInterfaceFunc, moduleHandle); + free(filename); + if (result == NVTXW3_RESULT_SUCCESS) + { + return NVTXW3_RESULT_SUCCESS; + } + + /* 2. Standard search paths for dynamic libraries */ + result = InitLibraryFilename( + NVTXW3_LIB_FILENAME_DEFAULT, configString, getInterfaceFunc, moduleHandle); + if (result == NVTXW3_RESULT_SUCCESS) + { + return NVTXW3_RESULT_SUCCESS; + } + + /* 3. 
Current working directory (may not be included in standard search paths) */ + filename = AppendToPathHeapString(GetCurrentWorkingDir(), NVTXW3_LIB_FILENAME_DEFAULT); + result = InitLibraryFilename( + filename, configString, getInterfaceFunc, moduleHandle); + free(filename); + + /* No usable backend found */ + return NVTXW3_RESULT_LIBRARY_NOT_FOUND; +} + +static nvtxwResultCode_t InitLibraryDirectory( + const char* directory, /* required */ + const char* configString, /* optional */ + nvtxwGetInterface_t* getInterfaceFunc, /* already null-checked */ + void** moduleHandle) /* optional */ +{ + nvtxwResultCode_t result; + char* filename; + + if (!directory) return NVTXW3_RESULT_INVALID_ARGUMENT; + + filename = AppendToPathHeapString( + MakeHeapString(directory), NVTXW3_LIB_FILENAME_DEFAULT); + + result = InitLibraryFilename(filename, configString, getInterfaceFunc, moduleHandle); + free(filename); + + return result; +} + +static nvtxwResultCode_t InitConfigString( + const char* config, + nvtxwGetInterface_t* getInterfaceFunc, + void** moduleHandle) +{ + nvtxwResultCode_t result; + int err; + int mode = 0; + char* modeString = NULL; + + if (!config) return NVTXW3_RESULT_INVALID_ARGUMENT; + + err = GetInitModeFromConfig(config, &mode, &modeString); + if (err) + { + free(modeString); + return NVTXW3_RESULT_CONFIG_MISSING_LOADER_INFO; + } + + switch (mode) + { + case NVTXW3_INIT_MODE_SEARCH_DEFAULT : result = InitSearchDefault ( config, getInterfaceFunc, moduleHandle); break; + case NVTXW3_INIT_MODE_LIBRARY_FILENAME : result = InitLibraryFilename (modeString, config, getInterfaceFunc, moduleHandle); break; + case NVTXW3_INIT_MODE_LIBRARY_DIRECTORY: result = InitLibraryDirectory(modeString, config, getInterfaceFunc, moduleHandle); break; + default: result = NVTXW3_RESULT_UNSUPPORTED_LOADER_MODE; + } + + free(modeString); + return result; +} + +static nvtxwResultCode_t InitConfigEnvVar( + const char* configEnvVarName, + nvtxwGetInterface_t* getInterfaceFunc, + void** 
moduleHandle) +{ + const char* config; + + if (!configEnvVarName) return NVTXW3_RESULT_INVALID_ARGUMENT; + + config = getenv(configEnvVarName); + if (!config) return NVTXW3_RESULT_ENV_VAR_NOT_FOUND; + + return InitConfigString(config, getInterfaceFunc, moduleHandle); +} + +static nvtxwResultCode_t InitConfigFilename( + const char* configFilename, + nvtxwGetInterface_t* getInterfaceFunc, + void** moduleHandle) +{ + nvtxwResultCode_t result; + char* config; + + if (!configFilename) return NVTXW3_RESULT_INVALID_ARGUMENT; + + config = LoadFileIntoHeapString(configFilename); + if (!config) return NVTXW3_RESULT_CONFIG_NOT_FOUND; + + result = InitConfigString(config, getInterfaceFunc, moduleHandle); + free(config); + return result; +} + +static nvtxwResultCode_t InitConfigDirectory( + const char* configDirectory, + nvtxwGetInterface_t* getInterfaceFunc, + void** moduleHandle) +{ + nvtxwResultCode_t result; + char* configFilename; + + if (!configDirectory) return NVTXW3_RESULT_INVALID_ARGUMENT; + + configFilename = AppendToPathHeapString( + MakeHeapString(configDirectory), NVTXW3_CONFIG_FILENAME_DEFAULT); + + result = InitConfigFilename(configFilename, getInterfaceFunc, moduleHandle); + free(configFilename); + return result; +} + +/* #define NVTXW3_TEST_PATH_UTILITIES */ +#if defined(NVTXW3_TEST_PATH_UTILITIES) +#include +#endif + +NVTXW3_DECLSPEC nvtxwResultCode_t nvtxwInitialize( + nvtxwInitMode_t mode, + const char* modeString, + nvtxwGetInterface_t* getInterfaceFunc, + void** moduleHandle) +{ +#if defined(NVTXW3_TEST_PATH_UTILITIES) + TestPathUtilities(); +#endif + + if (!getInterfaceFunc) + { + return NVTXW3_RESULT_INVALID_ARGUMENT; + } + + switch (mode) + { + case NVTXW3_INIT_MODE_SEARCH_DEFAULT : return InitSearchDefault ( NULL, getInterfaceFunc, moduleHandle); + case NVTXW3_INIT_MODE_LIBRARY_FILENAME : return InitLibraryFilename (modeString, NULL, getInterfaceFunc, moduleHandle); + case NVTXW3_INIT_MODE_LIBRARY_DIRECTORY: return InitLibraryDirectory(modeString, 
NULL, getInterfaceFunc, moduleHandle); + case NVTXW3_INIT_MODE_CONFIG_FILENAME : return InitConfigFilename (modeString, getInterfaceFunc, moduleHandle); + case NVTXW3_INIT_MODE_CONFIG_DIRECTORY : return InitConfigDirectory (modeString, getInterfaceFunc, moduleHandle); + case NVTXW3_INIT_MODE_CONFIG_STRING : return InitConfigString (modeString, getInterfaceFunc, moduleHandle); + case NVTXW3_INIT_MODE_CONFIG_ENV_VAR : return InitConfigEnvVar (modeString, getInterfaceFunc, moduleHandle); + } + + return NVTXW3_RESULT_INVALID_INIT_MODE; +} + +NVTXW3_DECLSPEC void nvtxwUnload( + void* moduleHandle) +{ + nvtxwUnloadImplementation_t pfnUnload; + NVTXW3_DLLHANDLE hModule = (NVTXW3_DLLHANDLE)moduleHandle; + + if (!hModule) return; + + pfnUnload = (nvtxwUnloadImplementation_t)NVTXW3_DLLFUNC(hModule, "nvtxwUnloadImplementation"); + if (pfnUnload) + { + pfnUnload(); + } + + NVTXW3_DLLCLOSE(hModule); +} diff --git a/src/main/cpp/profiler/nvtxw3.h b/src/main/cpp/profiler/nvtxw3.h new file mode 100644 index 0000000000..d8dc40aa0f --- /dev/null +++ b/src/main/cpp/profiler/nvtxw3.h @@ -0,0 +1,549 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Licensed under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + */ + +#if !defined(NVTXW3_API) +#define NVTXW3_API + +#include + +#include /* For nvtxwConsumeConfigString inline implementation */ + +#ifdef __cplusplus +#define NVTXW3_DECLSPEC extern "C" +#else +#define NVTXW3_DECLSPEC extern +#endif + +typedef int32_t nvtxwResultCode_t; + +#define NVTXW3_RESULT_SUCCESS 0 +#define NVTXW3_RESULT_FAILED 1 +#define NVTXW3_RESULT_INVALID_ARGUMENT 2 +#define NVTXW3_RESULT_INVALID_INIT_MODE 3 +#define NVTXW3_RESULT_LIBRARY_NOT_FOUND 4 +#define NVTXW3_RESULT_CONFIG_NOT_FOUND 5 +#define NVTXW3_RESULT_LOADER_SYMBOL_MISSING 6 +#define NVTXW3_RESULT_LOADER_FAILED 7 +#define NVTXW3_RESULT_INTERFACE_ID_NOT_SUPPORTED 8 +#define NVTXW3_RESULT_CONFIG_MISSING_LOADER_INFO 9 +#define NVTXW3_RESULT_UNSUPPORTED_LOADER_MODE 10 +#define NVTXW3_RESULT_ENV_VAR_NOT_FOUND 11 + + +#if defined(_WIN32) +#define NVTXW3_LIB_PREFIX "" +#define NVTXW3_LIB_SUFFIX ".dll" +#else +#define NVTXW3_LIB_PREFIX "lib" +#define NVTXW3_LIB_SUFFIX ".so" +#endif + +/* Name of backend library file to use with init mode LIBRARY_DIRECTORY. +* Note the platform-dependent prefix and suffix above are added here. */ +#define NVTXW3_LIB_FILENAME_DEFAULT NVTXW3_LIB_PREFIX "nvtxw3" NVTXW3_LIB_SUFFIX + +/* Name of config file to use with init mode CONFIG_DIRECTORY. +* Unlike the library filename, no platform-dependent prefix or suffix is added. */ +#define NVTXW3_CONFIG_FILENAME_DEFAULT "nvtxw3.ini" + +/* Init modes: nvtxwInitialize takes nvtxwInitMode_t mode, one of the #defines +* below, and a modeString, whose meaning is dependent on the mode. These modes +* provide a variety of ways to find the NVTXW backend implementation library. */ +typedef int32_t nvtxwInitMode_t; + +/* Default search mode is to look for library with default filename, as defined +* by NVTXW3_LIB_FILENAME_DEFAULT, in the following order: +* 1. Directory of current process's executable +* 2. Standard search paths for dynamic libraries +* 3.
Current working directory (may not be included in standard search paths) +* The modeString argument is ignored. */ +#define NVTXW3_INIT_MODE_SEARCH_DEFAULT 0 + +/* The modeString argument is interpreted as a filename or pathname to the +* backend library. The string is passed directly to the platform function +* for loading dynamic libraries (dlopen/LoadLibrary), so that function's +* behavior will apply. In general, a filename with no path will try the +* standard search paths, and an absolute path will be used verbatim. */ +#define NVTXW3_INIT_MODE_LIBRARY_FILENAME 1 + +/* The modeString argument is interpreted as a directory in which to search +* for the backend library, whose filename is defined by the macro +* NVTXW3_LIB_FILENAME_DEFAULT. */ +#define NVTXW3_INIT_MODE_LIBRARY_DIRECTORY 2 + +/* The modeString argument is interpreted as a filename or pathname to a +* config file, which will be used to find the backend library. If the +* filename is not an absolute path, it will be interpreted as relative +* to the current working directory. See below for config file format. */ +#define NVTXW3_INIT_MODE_CONFIG_FILENAME 3 + +/* The modeString argument is interpreted as a directory in which to search +* for a config file, which will be used to find the backend library. The +* name of the config file is defined by NVTXW3_CONFIG_FILENAME_DEFAULT. +* See below for config file format. */ +#define NVTXW3_INIT_MODE_CONFIG_DIRECTORY 4 + +/* The modeString argument is interpreted as the config string itself. +* See below for config string format. */ +#define NVTXW3_INIT_MODE_CONFIG_STRING 5 + +/* The modeString argument is interpreted as the name of an environment +* variable that contains the config string. See below for config string +* format. */ +#define NVTXW3_INIT_MODE_CONFIG_ENV_VAR 6 + +/* Config format (for both files and flat config strings): +* +* The format is key=value pairs, delimited by new-line characters or +* | (pipe) characters.
Values are prohibited from containing those +* characters. If an entry begins with #, the entry (up to the next +* new-line or pipe) is discarded as a comment. +* +* When the config string is provided to the SessionBegin function +* as an argument, it is preprocessed to remove comments, blank lines, +* and to convert all entry delimiters to a single \n (line feed). +* This allows the tool to have a simpler config parser, and to print +* the config in a readable format. +* +* If a config specifies the same key multiple times, only the first +* appearance should be honored, and the subsequent appearances should +* be ignored. This allows a simple scan for a particular key to loop +* from the beginning until the first occurrence is found, and not have +* to loop through the rest for repeats. Note that this means building +* a map from keys to values should not overwrite existing values if a +* found key already exists in the map. This guarantee allows adding +* extra key/value pairs to a config string by prepending (to override +* existing keys) or appending (to set values only if they weren't set +* already). +* +* Keys are tool-specific, but the loader supports two keys: +* +* - InitMode=n +* Just like the argument to nvtxwInitialize, this allows the user +* to specify how to find the backend library, using one of the +* numeric values of the NVTXW3_INIT_MODE_ constants. Currently, +* only values 0-2 are supported for init modes specified within +* a config file/string. +* +* - InitModeString=string +* Just like the argument to nvtxwInitialize, this allows the user +* to specify a mode-specific string for how to find the backend +* library. This key is ignored for mode 0 (SEARCH_DEFAULT), but +* required for other modes. Currently, only mode values 0-2 are +* supported for init modes specified within a config file/string. 
+*/ + +/*--------- Helpers for consuming config strings ----------------*/ + +/* Typedef of function pointer for callback to use with nvtxwConsumeConfigString. +* The state pointer can be used for anything -- nvtxwConsumeConfigString passes +* it directly to the callback. The begin/end pointers for the key and value are +* pointing to ranges within the input config string. If the input config string +* is known to be non-const, this callback can safely cast away const and write +* to these pointers, for example when simplifying an input config string. To +* check if a key name is a particular string, use: +* strncmp("ExampleKeyName", keyBegin, keyEnd - keyBegin) == 0 +* In C++, you can construct a string using std::string(keyBegin, keyEnd). +* Return zero to continue consuming key/value pairs, or non-zero to stop. */ +typedef int (*nvtxwKeyValuePairConsumer_t)( + void* state, + const char* keyBegin, + const char* keyEnd, + const char* valBegin, + const char* valEnd); + +/* Parse config and call the consumer callback (see typedef above) on each +* valid key/value pair found in the config. Inline implementation provided +* here so backend implementations of NVTXW can use this function without +* having to include nvtxw3.c in their build. Users of the NVTXW API may +* also find it useful to parse/modify a config before passing it to NVTXW. 
*/ +NVTX_LINKONCE_DEFINE_FUNCTION +void nvtxwConsumeConfigString(const char* config, nvtxwKeyValuePairConsumer_t consumer, void* state) +{ + const char* curRead = config; + const char* const lineBreak = "|\n\r"; + const char* const whitespace = " \t\v"; /* Not including lineBreak characters */ + int consumerStopRequested = 0; + + if (!config || !consumer) return; + + while (*curRead && !consumerStopRequested) + { + const char* lineBegin; + const char* lineEnd; + const char* keyBegin; + const char* keyEnd; + const char* valBegin; + const char* valEnd; + + /* Read a line, trimming leading whitespace - get pointers to begin/end */ + lineBegin = curRead + strspn(curRead, whitespace); + lineEnd = lineBegin + strcspn(lineBegin, lineBreak); + + /* Set read pointer to beginning of next line, so we can continue any time */ + curRead = lineEnd + strspn(lineEnd, lineBreak); + + /* Ignore line if it's only whitespace */ + if (lineBegin == lineEnd) continue; + /* Ignore line if it's is a comment */ + if (*lineBegin == '#') continue; + + /* Determine if line has a key and value delimited by '=' */ + keyBegin = lineBegin; + keyEnd = keyBegin; + while (keyEnd < lineEnd && *keyEnd != '=') ++keyEnd; + + /* Ignore line if there's no '=' in the line */ + if (keyEnd == lineEnd) continue; + /* Ignore line if there's no key name before '=' */ + if (keyEnd == keyBegin) continue; + + /* keyEnd now points at '=' after the key */ + valBegin = keyEnd + 1; + valBegin += strspn(valBegin, whitespace); + + /* Ignore line if all characters after '=' are whitespace */ + if (valBegin == lineEnd) continue; + + valEnd = lineEnd; + + /* Got begin/end pointers for key and value. We know there are non-whitespace + * characters in both of them, and their leading whitespace was already trimmed. + * Now trim their trailing whitespace. 
*/ + while (strchr(whitespace, *(keyEnd - 1))) --keyEnd; + while (strchr(whitespace, *(valEnd - 1))) --valEnd; + + /* Now key and value begin/end pointers can be passed to the consumer */ + consumerStopRequested = consumer(state, keyBegin, keyEnd, valBegin, valEnd); + } +} + +/*--------- Initialization interface ---------*/ + +typedef int32_t nvtxwInterfaceId_t; + +typedef nvtxwResultCode_t (*nvtxwGetInterface_t)( + nvtxwInterfaceId_t interfaceId, + const void** iface); + +/* Initialize the NVTXW library by providing information on how to +* load the backend library that implements the NVTXW API. `mode` must +* be one of the NVTXW3_INIT_MODE_ constants. `modeString` is required +* for all modes besides 0 (SEARCH_DEFAULT), and has mode-specific +* interpretation. See comments for the mode constants. Backend library +* must provide an exported function symbol "nvtxwLoadImplementation", +* which must return NVTXW3_RESULT_SUCCESS and provide a pointer to its +* GetInterface function for initialization to be considered successful. +* Modes that search multiple locations will continue searching after an +* unsuccessful attempt to initialize a library. +* `getInterfaceFunc` is an out-param that must be non-null to receive +* a pointer to the backend's GetInterface function, which is used to +* make version-safe calls into the backend library. +* `moduleHandle` is an out-param that can be null. If non-null, it +* receives the platform-specific module handle of the loaded backend +* library when NVTXW3_RESULT_SUCCESS is returned. This can be passed +* to nvtxwUnload to unload the backend library. */ +NVTXW3_DECLSPEC nvtxwResultCode_t nvtxwInitialize( + nvtxwInitMode_t mode, + const char* modeString, + nvtxwGetInterface_t* getInterfaceFunc, + void** moduleHandle); + +/* A backend library may optionally provide an exported function symbol +* "nvtxwUnloadImplementation". If it does, nvtxwUnload will call this +* function before closing the module handle. 
This gives the backend a
+* chance to free any memory tracked in global variables before it gets
+* unloaded. Attempting to unload the backend is not necessary and not
+* even recommended in common cases -- it is included to ensure clients
+* of the NVTXW API have a way to cleanly pass a memory checker. */
+NVTXW3_DECLSPEC void nvtxwUnload(
+    void* moduleHandle); /* Module handle received from nvtxwInitialize */
+
+/*----- Typedefs for function pointers backend implements -----*/
+
+/* Type of the "nvtxwLoadImplementation" symbol a backend exports (see the
+ * nvtxwInitialize comment): it returns a result code and hands back the
+ * backend's GetInterface function through getInterfaceFunc. */
+typedef nvtxwResultCode_t (*nvtxwLoadImplementation_t)(
+    const char* configString,
+    nvtxwGetInterface_t* getInterfaceFunc);
+
+/* Type of the optional "nvtxwUnloadImplementation" symbol that nvtxwUnload
+ * calls before closing the module handle. */
+typedef void (*nvtxwUnloadImplementation_t)();
+
+/*--------- Interface IDs ----------------*/
+
+#define NVTXW3_INTERFACE_ID_CORE_V1 2 /* interfaceId to pass to nvtxwGetInterface_t for nvtxwInterfaceCore_t */
+
+/*--------- INTERFACE_ID_CORE_V1 ---------*/
+
+/* Opaque session handle: written by SessionBegin, consumed by SessionEnd and StreamOpen. */
+typedef struct nvtxwSessionHandle_t
+{
+    void* opaque;
+} nvtxwSessionHandle_t;
+
+/* Opaque stream handle: written by StreamOpen, consumed by StreamClose,
+ * the Register functions and EventWrite. */
+typedef struct nvtxwStreamHandle_t
+{
+    void* opaque;
+} nvtxwStreamHandle_t;
+
+/* Growable struct of arguments for SessionBegin */
+typedef struct nvtxwSessionAttributes_v1
+{
+    /* Guaranteed to increase when new members are added at the end */
+    size_t struct_size;
+
+    /* Provide a name for the session.
+     * Tools may display this name, or use it to name a file or directory
+     * representing the session. */
+    const char* name;
+
+    /* String containing configuration options for the session.
+     * Format is key=value, one per line, delimited by \n (line feed).
+     * Key names must not contain an = (equals sign), and values may
+     * contain any character except \r (carriage return), \n (line feed),
+     * or | (pipe). Tools shall use reasonable defaults for any config
+     * options not provided, and ignore any keys they do not support.
+     * See above for explanation of how config strings are provided.
+     * See tool-specific documentation for lists of supported keys. */
+    const char* configString;
+} nvtxwSessionAttributes_t;
+
+/* Define whether event ordering in a stream is based on event scope.
+ * These are the values for the orderInterleaving field of nvtxwStreamAttributes_t. */
+
+/* Event ordering is defined at the stream level, independent of
+* event scopes within the stream. */
+#define NVTXW3_STREAM_ORDER_INTERLEAVING_NONE (int16_t)0
+
+/* Event ordering is defined at the event scope level. This means
+* ordering guarantees described by the other fields only apply to
+* events of the same scope within the stream. The order of events
+* in different scopes is unspecified. */
+#define NVTXW3_STREAM_ORDER_INTERLEAVING_EVENT_SCOPE (int16_t)1
+
+
+/* Define how events are fully or partially sorted in a stream.
+ * These are the values for the orderingType field of nvtxwStreamAttributes_t. */
+
+/* No guarantees can be made about event ordering in the stream.
+* Events may need to be sorted by the tool. */
+#define NVTXW3_STREAM_ORDERING_TYPE_UNKNOWN (int16_t)0
+
+/* All events represent single points in time and are fully or
+* partially sorted in the order in which they occurred. */
+#define NVTXW3_STREAM_ORDERING_TYPE_STRICT (int16_t)1
+
+/* Events that represent single points in time are fully or
+* partially sorted in the order in which they occurred, and
+* events representing time ranges in order of begin time. */
+#define NVTXW3_STREAM_ORDERING_TYPE_PACKED_RANGE_START (int16_t)2
+
+/* Events that represent single points in time are fully or
+* partially sorted in the order in which they occurred, and
+* events representing time ranges in order of end time. */
+#define NVTXW3_STREAM_ORDERING_TYPE_PACKED_RANGE_END (int16_t)3
+
+/* Define how to quantify skid when events are partially sorted. Only considered
+* when orderingType is not UNKNOWN. Which events in the stream this applies to
+* depends on the value of orderInterleaving. Which timestamp is used for ordering
+* in an event with multiple timestamps depends on the value of orderingType.
+* These are the values for the orderingSkid field of nvtxwStreamAttributes_t. */
+
+/* Events are fully sorted. */
+#define NVTXW3_STREAM_ORDERING_SKID_NONE 0
+
+/* Events are partially sorted.
The orderingSkidAmount field defines "skid" as
+* a number of nanoseconds. For any two events A and B in the stream or scope
+* (depending on interleaving level), where A is written into the stream before
+* B, the tool must handle the case where B has a lower timestamp than A, but
+* can assume B's timestamp cannot be more than the "skid" number of nanoseconds
+* earlier than A's timestamp. Note that timestamp values in events cannot be
+* assumed to be in units of nanoseconds, so this value cannot be added directly
+* timestamp values without conversion. */
+#define NVTXW3_STREAM_ORDERING_SKID_TIME_NS 1
+
+/* Events are partially sorted. The orderingSkidAmount field defines "skid" as
+* a number of events. Regarding only events in a stream or scope (depending on
+* interleaving level), for any event A, the next "skid" number of events after
+* A may have a lower timestamp than A (by any amount of time), but no events
+* written after that can have a lower timestamp than A. */
+
+/* Events are partially sorted. No event in the stream is written
+* more than the given number of events before any event written
+* previously in the stream. Note that
+* timestamps in events may not be in units of nanoseconds. */
+#define NVTXW3_STREAM_ORDERING_SKID_EVENT_COUNT 2
+
+/* Growable struct of arguments for StreamOpen */
+typedef struct nvtxwStreamAttributes_v1
+{
+    /* Guaranteed to increase when new members are added at the end */
+    size_t struct_size;
+
+    /* Name of a stream, used for identification from other streams.
+     * Tools typically will not display stream names. No two streams
+     * in the same session may have the same name. */
+    const char* name;
+
+    /* Name of NVTX domain to use implicitly for all events written into
+     * this stream. Since registered IDs are required to be unique within
+     * a domain, all ID registration functions called on this stream must
+     * not register the same ID value to mean different things. Multiple
+     * streams may use the same domain by specifying the same value for
+     * this string, and the tool is expected to combine registrations from
+     * these streams into a single set of registrations for the domain.
+     * If two streams share a domain, and a registration is made in one
+     * stream, the registered ID may be used immediately afterwards in the
+     * other stream, provided the usage occurs on the same thread -- it is
+     * implementation-defined whether or not this is supported if the usage
+     * occurs on a different thread. Tools are expected to combine data
+     * from any domains registered with the same name, even between NVTXW
+     * and NVTX, when merging data acquired from both APIs. */
+    const char* nvtxDomainName;
+
+    /* The default scope for all events in the stream that don't specify
+     * any scope. See comments below for nvtxwEventScopeAttributes_t.
+     * Note that "nvtxwStream" without brackets may not be used as a node
+     * name here -- this field is defining what that node name will mean
+     * in scope registrations occurring later in this stream. However,
+     * "nvtxwStream[name]" referencing a different stream by its name
+     * (see above) to use its default scope is supported, as long as that
+     * stream was successfully opened (and may be already closed). */
+    const char* eventScopePath;
+
+    /* Information about event ordering inside the stream. See comments
+     * for #defines above. */
+    int16_t orderInterleaving; /* NVTXW3_STREAM_ORDER_INTERLEAVING_* */
+    int16_t orderingType; /* NVTXW3_STREAM_ORDERING_TYPE_* */
+    int32_t orderingSkid; /* NVTXW3_STREAM_ORDERING_SKID_* */
+    int64_t orderingSkidAmount; /* Numeric value, dependent on skid type */
+} nvtxwStreamAttributes_t;
+
+/* Growable struct of arguments for EventScopeRegister */
+typedef struct nvtxwEventScopeAttributes_v1
+{
+    /* Guaranteed to increase when new members are added at the end */
+    size_t struct_size;
+
+    /* Path delimited by / characters, relative to hierarchy root.
+     * Nodes in the path may use name[key] syntax to indicate an
+     * array of sibling nodes, which may be combined with other
+     * non-array nodes or different arrays at the same scope.
+     * Leading slashes are ignored. Node names should be ASCII
+     * printable characters, excluding the /, [, and ] characters,
+     * which have special meaning here. A set of reserved node
+     * names with special properties is given in the documentation
+     * for NVTX Deferred Events. "nvtxwStream" is a reserved node
+     * name that can be used as a path's root node, indicating the
+     * path is relative to the eventScopePath set for the stream
+     * in which the event scope is registered. "nvtxwStream[name]"
+     * refers to the eventScopePath of a stream in the session with
+     * matching name. Note that the NVTX domain is implicitly a
+     * child node of the scope, since multiple domains can assign
+     * events to the same scope, and tools should isolate events
+     * from separate domains. */
+    const char* path;
+
+    /* Static event scope ID must be provided, unique within the domain,
+     * >= NVTX_EVENT_SCOPE_ID_STATIC_START, and
+     * < NVTX_EVENT_SCOPE_ID_DYNAMIC_START */
+    uint64_t scopeId;
+} nvtxwEventScopeAttributes_t;
+
+/* nvtxwInterfaceCore_t is a growable struct of function pointers to
+* the NVTX Writer (NVTXW) API. Breaking changes will not be made to
+* this interface without also changing the interface ID passed to
+* nvtxwGetInterface_t, e.g. NVTXW3_INTERFACE_ID_CORE_V1. Non-breaking
+* changes are made by adding fields to the end of the struct, ensuring the
+* value of 'struct_size' increases, so the presence of a member can
+* be checked by comparing struct_size with that member's offset. */
+typedef struct nvtxwInterfaceCore_v1
+{
+    /* Guaranteed to increase when new members are added at the end */
+    size_t struct_size;
+
+    /* Create a session, which represents a collection of trace data
+     * from one or more streams. Takes a growable struct of session
+     * attributes (see nvtxwSessionAttributes_t). The new session's
+     * handle is written through the `session` pointer. */
+    nvtxwResultCode_t (*SessionBegin)(
+        nvtxwSessionHandle_t* session,
+        const nvtxwSessionAttributes_t* attr);
+
+    /* Notify the implementation that all trace data for the session
+     * has been provided, and the session may be destroyed. Depending
+     * on configuration options, ending a session may trigger behavior
+     * like writing an output file or opening a data viewer. */
+    nvtxwResultCode_t (*SessionEnd)(
+        nvtxwSessionHandle_t session);
+
+    /* Create a stream within a session. A stream is the object events
+     * are written to. The NVTX domain and event scope are set when
+     * creating a stream, allowing individual events to avoid repeating
+     * these fields. Since ID values for schemas, registered strings,
+     * etc. are only unique within a domain, all registrations that
+     * assign an ID are done within a stream, since the domain is fixed
+     * inside a stream. Other stream properties set at creation time
+     * are a name string, and information about the way events in the
+     * stream are ordered. */
+    nvtxwResultCode_t (*StreamOpen)(
+        nvtxwStreamHandle_t* stream,
+        nvtxwSessionHandle_t session,
+        const nvtxwStreamAttributes_t* attr);
+
+    /* Destroy the stream object. This is not expected to trigger a
+     * reaction in the implementation that no more events are coming;
+     * only ending a session is intended to have that effect. */
+    nvtxwResultCode_t (*StreamClose)(
+        nvtxwStreamHandle_t stream);
+
+    /* Register a scope ID to represent a scope path, so the ID can be
+     * used in events or schemas to efficiently indicate a scope.
+     * Static event scope ID must be provided, unique within the domain,
+     * >= NVTX_EVENT_SCOPE_ID_STATIC_START, and
+     * < NVTX_EVENT_SCOPE_ID_DYNAMIC_START */
+    nvtxwResultCode_t (*EventScopeRegister)(
+        nvtxwStreamHandle_t stream,
+        const nvtxwEventScopeAttributes_t* attr);
+
+    /* Register a schema ID to represent a schema, which describes the
+     * binary layout of a payload.
+     * Static schema ID must be provided, unique within the domain,
+     * >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START, and
+     * < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */
+    nvtxwResultCode_t (*SchemaRegister)(
+        nvtxwStreamHandle_t stream,
+        const nvtxPayloadSchemaAttr_t* attr);
+
+    /* Register a schema ID to represent an enum type, including the
+     * mapping between its values and their name strings.
+     * Static schema ID must be provided, unique within the domain,
+     * >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START, and
+     * < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */
+    nvtxwResultCode_t (*EnumRegister)(
+        nvtxwStreamHandle_t stream,
+        const nvtxPayloadEnumAttr_t* attr);
+
+    /* Write a batch of payloads into the stream representing one or more
+     * events. A logical event with multiple payloads cannot be broken up
+     * across multiple calls to EventWrite. The schema definitions for
+     * the payloads dictate how they are interpreted as events.
+     * payloadCount is the number of elements in the payloads array. */
+    nvtxwResultCode_t (*EventWrite)(
+        nvtxwStreamHandle_t stream,
+        const nvtxPayloadData_t* payloads,
+        size_t payloadCount);
+
+} nvtxwInterfaceCore_t;
+
+#endif /* NVTXW3_API */
diff --git a/src/main/cpp/profiler/nvtxw_events.h b/src/main/cpp/profiler/nvtxw_events.h
new file mode 100644
index 0000000000..3b46c1c989
--- /dev/null
+++ b/src/main/cpp/profiler/nvtxw_events.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "nvtxw3.h"
+#include "NvtxwEvents.h"
+#include /* NOTE(review): header name missing in this view; the declarations below need std::string and std::ifstream */
+/* Creates a stream `name` in NVTX domain `domain` within `session`; the converter treats true as success and then uses `stream`. */
+extern bool createNvtxwStream(const nvtxwInterfaceCore_t *nvtxwInterface,
+                              const nvtxwSessionHandle_t& session,
+                              const std::string & name,
+                              const std::string & domain,
+                              nvtxwStreamHandle_t & stream);
+/* NOTE(review): definition not in view; the reference parameters look like outputs (module handle, interface, session, stream) -- confirm, along with the meaning of the int result. */
+extern int initialize_nvtxw(std::ifstream& in, const std::string& outName,
+                            void *& nvtxwModuleHandle,
+                            nvtxwInterfaceCore_t *&nvtxwInterface,
+                            nvtxwSessionHandle_t &session,
+                            nvtxwStreamHandle_t &stream);
diff --git a/src/main/cpp/profiler/spark_rapids_profile_converter.cpp b/src/main/cpp/profiler/spark_rapids_profile_converter.cpp
index b916020392..77f5a3b4aa
100644 --- a/src/main/cpp/profiler/spark_rapids_profile_converter.cpp +++ b/src/main/cpp/profiler/spark_rapids_profile_converter.cpp @@ -50,11 +50,12 @@ extern char const* Profiler_Schema; struct program_options { std::optional output_path; - bool help = false; - bool json = false; - bool nvtxt = false; - int json_indent = 2; - bool version = false; + bool help = false; + bool json = false; + bool nvtxt = false; + bool nvtxw = false; + int json_indent = 2; + bool version = false; }; struct event { @@ -114,6 +115,7 @@ Converts the spark-rapids profile in profile.bin into other forms. -i, --json-indent=INDENT indentation to use for JSON. 0 is no indent, less than 0 also removes newlines -o, --output=PATH use PATH as the output filename -t. --nvtxt convert to NVTXT, default output is stdout + -w. --nvtxw generate nsys-rep using NVTXW API -V, --version print the version number )" << std::endl; } @@ -179,13 +181,32 @@ std::pair> parse_options( ++argp; } } else if (*argp == "-j" || *argp == "--json") { - if (opts.nvtxt) { throw std::runtime_error("JSON and NVTXT output are mutually exclusive"); } + if (opts.nvtxt) { + throw std::runtime_error("JSON and NVTXT output are mutually exclusive"); + } + if (opts.nvtxw) { + throw std::runtime_error("JSON and NVTXW output are mutually exclusive"); + } opts.json = true; ++argp; } else if (*argp == "-t" || *argp == "--nvtxt") { - if (opts.json) { throw std::runtime_error("JSON and NVTXT output are mutually exclusive"); } + if (opts.json) { + throw std::runtime_error("JSON and NVTXT output are mutually exclusive"); + } + if (opts.nvtxw) { + throw std::runtime_error("NVTXT and NVTXW output are mutually exclusive"); + } opts.nvtxt = true; ++argp; + } else if (*argp == "-w" || *argp == "--nvtxw") { + if (opts.json) { + throw std::runtime_error("JSON and NVTXW output are mutually exclusive"); + } + if (opts.nvtxt) { + throw std::runtime_error("NVTXT and NVTXW output are mutually exclusive"); + } + opts.nvtxw = true; + ++argp; } else if 
(*argp == "-V" || *argp == "--version") { opts.version = true; ++argp; @@ -687,12 +708,385 @@ void convert_to_nvtxt(std::ifstream& in, std::ostream& out, program_options cons } } +#include "nvtxw_events.h" + +void convert_to_nvtxw(std::ifstream& in, nvtxwInterfaceCore_t *&nvtxwInterface, + nvtxwSessionHandle_t& session, + nvtxwStreamHandle_t& stream, + program_options const& opts) +{ + nvtxwResultCode_t result = NVTXW3_RESULT_SUCCESS; + int errorCode = 0; + struct marker_start { + uint64_t timestamp; + uint32_t process_id; + uint32_t thread_id; + uint32_t color; + uint32_t category; + std::string name; + std::string domain; + }; + std::unordered_map marker_data_map; + std::unordered_map marker_start_map; + std::unordered_map domainToStreamMap; + size_t num_dropped_records = 0; + uint32_t api_process_id = 0; + while (!in.eof()) { + auto fb_ptr = read_flatbuffer(in); + auto records = validate_fb(*fb_ptr, "ActivityRecords"); + auto dropped = records->dropped(); + if (dropped != nullptr) { + for (int i = 0; i < dropped->size(); ++i) { + auto d = dropped->Get(i); + num_dropped_records += d->num_dropped(); + } + } + auto api = records->api(); + if (api != nullptr) { + NvidiaNvtxw::cuptiApiEvent event; + for (int i = 0; i < api->size(); ++i) { + auto a = api->Get(i); + event.time_start = a->start(); + event.time_stop = a->end(); + event.kind = a->kind() + 1; + event.cbid = a->cbid(); + event.process_id = a->process_id(); + if (api_process_id == 0) { + api_process_id = a->process_id() & 0xffffff; + } + event.thread_id = a->thread_id() & 0xffffff; + event.correlation_id = a->correlation_id(); + event.return_value = a->return_value(); + nvtxPayloadData_t payloadData[] = { + {NvidiaNvtxw::PayloadSchemaId::cuptiApiId, sizeof(event), &event}, + }; + result = nvtxwInterface->EventWrite(stream, payloadData, std::extent::value); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "API EventWrite failed with code %d\n", (int)result); + errorCode |= 4; + } + } + } + auto 
device = records->device(); + if (device != nullptr) { + NvidiaNvtxw::cuptiDevice event; + for(int i = 0; i < device->size(); ++i) { + auto d = device->Get(i); + event.global_memory_bandwidth = d->global_memory_bandwidth(); + event.global_memory_size = d->global_memory_size(); + event.constant_memory_size = d->constant_memory_size(); + event.l2_cache_size = d->l2_cache_size(); + event.num_threads_per_warp = d->num_threads_per_warp(); + event.core_clock_rate = d->core_clock_rate(); + event.num_memcpy_engines = d->num_memcpy_engines(); + event.num_multiprocessors = d->num_multiprocessors(); + event.max_ipc = d->max_ipc(); + event.max_warps_per_multiprocessor = d->max_warps_per_multiprocessor(); + event.max_blocks_per_multiprocessor = d->max_blocks_per_multiprocessor(); + event.max_shared_memory_per_multiprocessor = d->max_shared_memory_per_multiprocessor(); + event.max_registers_per_multiprocessor = d->max_registers_per_multiprocessor(); + event.max_registers_per_block = d->max_registers_per_block(); + event.max_shared_memory_per_block = d->max_shared_memory_per_block(); + event.max_threads_per_block = d->max_threads_per_block(); + event.max_block_dim_x = d->max_block_dim_x(); + event.max_block_dim_y = d->max_block_dim_y(); + event.max_block_dim_z = d->max_block_dim_z(); + event.max_grid_dim_x = d->max_grid_dim_x(); + event.max_grid_dim_y = d->max_grid_dim_y(); + event.max_grid_dim_z = d->max_grid_dim_z(); + event.compute_capability_major = d->compute_capability_major(); + event.compute_capability_minor = d->compute_capability_minor(); + event.id = d->id(); + event.ecc_enabled = d->ecc_enabled(); + event.name = d->name()->c_str(); + nvtxPayloadData_t payloadData[] = { + {NvidiaNvtxw::PayloadSchemaId::nameId, strlen(event.name)+1, event.name}, + {NvidiaNvtxw::PayloadSchemaId::cuptiDeviceId, sizeof(event), &event}, + }; + result = nvtxwInterface->EventWrite(stream, payloadData, std::extent::value); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "Cupti 
Device EventWrite failed with code %d\n", (int)result); + errorCode |= 4; + } + } + } + auto marker_data = records->marker_data(); + if (marker_data != nullptr) { + for (int i = 0; i < marker_data->size(); ++i) { + auto m = marker_data->Get(i); + auto [it, inserted] = marker_data_map.insert({m->id(), m}); + if (not inserted) { + std::ostringstream oss; + oss << "duplicate marker data for " << m->id(); + throw std::runtime_error(oss.str()); + } + } + } + auto marker = records->marker(); + if (marker != nullptr) { + nvtxwStreamHandle_t nvtxStream; + for (int i = 0; i < marker->size(); ++i) { + auto m = marker->Get(i); + auto object_id = m->object_id(); + if (object_id != nullptr) { + uint32_t process_id = object_id->process_id(); + uint32_t thread_id = object_id->thread_id(); + if (m->flags() & spark_rapids_jni::profiler::MarkerFlags_Start) { + auto it = marker_data_map.find(m->id()); + uint32_t color = 0x444444; + uint32_t category = 0; + if (it != marker_data_map.end()) { + color = it->second->color(); + category = it->second->category(); + } + marker_start ms{m->timestamp(), process_id, thread_id, color, category, m->name()->str(), m->domain()->str()}; + auto [ignored, inserted] = marker_start_map.insert({m->id(), ms}); + if (not inserted) { + std::ostringstream oss; + oss << "duplicate marker start for ID " << m->id(); + throw std::runtime_error(oss.str()); + } + } else if (m->flags() & spark_rapids_jni::profiler::MarkerFlags_End) { + auto it = marker_start_map.find(m->id()); + if (it != marker_start_map.end()) { + auto const& ms = it->second; + // use default stream unless nvtx range has a domain + nvtxStream = stream; + std::string domainStr(ms.domain); + if (!domainStr.empty()) + { + auto domainStreamIt = domainToStreamMap.find(domainStr); + if (domainStreamIt != domainToStreamMap.end()) + { + // reuse existing stream for this domain + nvtxStream = domainStreamIt->second; + } + else + { + // create a new stream for this domain + bool valid = 
createNvtxwStream(nvtxwInterface, session, domainStr, domainStr, nvtxStream); + if (valid) + { + domainToStreamMap[domainStr] = nvtxStream; + } + else + { + fprintf(stderr, "createNvtxwStream failed for domain %s\n", domainStr.c_str()); + nvtxStream = stream; + errorCode |= 1; + } + } + } + NvidiaNvtxw::nvtxRangeEvent event; + event.time_start = ms.timestamp; + event.time_stop = m->timestamp(); + event.name = ms.name.c_str(); + event.process_id = ms.process_id & 0xffffff; + event.thread_id = ms.thread_id & 0xffffff; + event.color = ms.color; + nvtxPayloadData_t payloadData[] = { + {NvidiaNvtxw::PayloadSchemaId::nameId, strlen(event.name)+1, event.name}, + {NvidiaNvtxw::PayloadSchemaId::nvtxRangePushPopId, sizeof(event), &event}, + }; + result = nvtxwInterface->EventWrite(nvtxStream, payloadData, std::extent::value); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "NvtxRange EventWrite failed with code %d\n", (int)result); + errorCode |= 4; + } + marker_start_map.erase(it); + } else { + std::cerr << "Ignoring marker end without start for ID " << m->id() << std::endl; + } + } else { + std::cerr << "Ignoring marker with unsupported flags: " << m->flags() << std::endl; + } + } else { + std::cerr << "Marker " << m->id() << " has no object ID" << std::endl; + } + } + } + marker_data_map.clear(); + auto kernel = records->kernel(); + if (kernel != nullptr) { + NvidiaNvtxw::cuptiKernelEvent event; + for (int i = 0; i < kernel->size(); ++i) { + auto k = kernel->Get(i); + event.time_start = k->start(); + event.time_stop = k->end(); + event.completed = k->completed(); + event.grid_id = k->grid_id(); + event.queued = k->queued(); + event.submitted = k->submitted(); + event.graph_node_id = k->graph_node_id(); + event.local_memory_total_v2 = k->local_memory_total_v2(); + event.name = k->name()->c_str(); + event.device_id = k->device_id(); + event.context_id = k->context_id(); + event.stream_id = k->stream_id(); + event.process_id = api_process_id; + event.grid_x = 
k->grid_x(); + event.grid_y = k->grid_y(); + event.grid_z = k->grid_z(); + event.block_x = k->block_x(); + event.block_y = k->block_y(); + event.block_z = k->block_z(); + event.static_shared_memory = k->static_shared_memory(); + event.dynamic_shared_memory = k->dynamic_shared_memory(); + event.local_memory_per_thread = k->local_memory_per_thread(); + event.local_memory_total = k->local_memory_total(); + event.correlation_id = k->correlation_id(); + event.shared_memory_executed = k->shared_memory_executed(); + event.graph_id = k->graph_id(); + event.channel_id = k->channel_id(); + event.cluster_x = k->cluster_x(); + event.cluster_y = k->cluster_y(); + event.cluster_z = k->cluster_z(); + event.cluster_scheduling_policy = k->cluster_scheduling_policy(); + event.registers_per_thread = k->registers_per_thread(); + event.requested = k->requested(); + event.executed = k->executed(); + event.shared_memory_config = k->shared_memory_config(); + event.partitioned_global_cache_requested = k->partitioned_global_cache_requested(); + event.partitioned_global_cache_executed = k->partitioned_global_cache_executed(); + event.launch_type = k->launch_type(); + event.is_shared_memory_carveout_requested = k->is_shared_memory_carveout_requested(); + event.shared_memory_carveout_requested = k->shared_memory_carveout_requested(); + event.shmem_limit_config = k->shmem_limit_config(); + event.channel_type = k->channel_type(); + nvtxPayloadData_t payloadData[] = { + {NvidiaNvtxw::PayloadSchemaId::nameId, strlen(event.name)+1, event.name}, + {NvidiaNvtxw::PayloadSchemaId::cuptiKernelId, sizeof(event), &event}, + }; + result = nvtxwInterface->EventWrite(stream, payloadData, std::extent::value); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "Kernel EventWrite failed with code %d\n", (int)result); + errorCode |= 4; + } + } + } + auto memcpy = records->memcpy(); + if (memcpy != nullptr) { + NvidiaNvtxw::cuptiMemcpyEvent event; + for (int i = 0; i < memcpy->size(); ++i) { + auto m = 
memcpy->Get(i); + event.time_start = m->start(); + event.time_stop = m->end(); + event.bytes = m->bytes(); + event.graph_node_id = 0; + event.device_id = m->device_id(); + event.context_id = m->context_id(); + event.stream_id = m->stream_id(); + event.process_id = api_process_id; + event.correlation_id = m->correlation_id(); + event.runtime_correlation_id = m->runtime_correlation_id(); + event.graph_id = 0; + event.channel_id = m->channel_id(); + event.copy_kind = m->copy_kind(); + event.src_kind = m->src_kind(); + event.dst_kind = m->dst_kind(); + event.channelType = m->channel_type(); + nvtxPayloadData_t payloadData[] = { + {NvidiaNvtxw::PayloadSchemaId::cuptiMemcpyId, sizeof(event), &event}, + }; + result = nvtxwInterface->EventWrite(stream, payloadData, std::extent::value); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "Memcpy EventWrite failed with code %d\n", (int)result); + errorCode |= 4; + } + } + } + auto memset = records->memset(); + if (memset != nullptr) { + NvidiaNvtxw::cuptiMemsetEvent event; + for (int i = 0; i < memset->size(); ++i) { + auto m = memset->Get(i); + event.time_start = m->start(); + event.time_stop = m->end(); + event.bytes = m->bytes(); + event.graph_node_id = 0; + event.device_id = m->device_id(); + event.context_id = m->context_id(); + event.stream_id = m->stream_id(); + event.process_id = api_process_id; + event.correlation_id = m->correlation_id(); + event.graph_id = 0; + event.channel_id = m->channel_id(); + event.value = m->value(); + event.mem_kind = m->memory_kind(); + event.flags = m->flags(); + event.channelType = m->channel_type(); + nvtxPayloadData_t payloadData[] = { + {NvidiaNvtxw::PayloadSchemaId::cuptiMemsetId, sizeof(event), &event}, + }; + result = nvtxwInterface->EventWrite(stream, payloadData, std::extent::value); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "Memset EventWrite failed with code %d\n", (int)result); + errorCode |= 4; + } + } + } + auto overhead = records->overhead(); + 
if (overhead != nullptr) { + NvidiaNvtxw::cuptiOverheadEvent event; + for (int i = 0; i < overhead->size(); ++i) { + auto o = overhead->Get(i); + auto object_id = o->object_id(); + if (object_id != nullptr) { + event.time_start = o->start(); + event.time_stop = o->end(); + event.process_id = object_id->process_id() & 0xffffff; + event.thread_id = object_id->thread_id() & 0xffffff; + event.overhead_kind = o->overhead_kind(); + nvtxPayloadData_t payloadData[] = { + {NvidiaNvtxw::PayloadSchemaId::cuptiOverheadId, sizeof(event), &event}, + }; + result = nvtxwInterface->EventWrite(stream, payloadData, std::extent::value); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "Overhead EventWrite failed with code %d\n", (int)result); + errorCode |= 4; + } + } else { + std::cerr << "Overhead activity has no object ID" << std::endl; + } + } + } + in.peek(); + } + if (num_dropped_records) { + std::cerr << "Warning: " << num_dropped_records + << " records were noted as dropped in the profile" << std::endl; + } + for(auto it : domainToStreamMap) + { + result = nvtxwInterface->StreamClose(it.second); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "StreamClose failed for domain %s with code %d\n", it.first.c_str(), (int)result); + errorCode |= 8; + } + } + result = nvtxwInterface->StreamClose(stream); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "StreamClose failed with code %d\n", (int)result); + errorCode |= 8; + } +} + int main(int argc, char* argv[]) { constexpr int RESULT_SUCCESS = 0; constexpr int RESULT_FAILURE = 1; constexpr int RESULT_USAGE = 2; program_options opts; + int errorCode = 0; std::vector files; if (argc < 2) { print_usage(); @@ -740,6 +1134,24 @@ int main(int argc, char* argv[]) } else { convert_to_nvtxt(in, std::cout, opts); } + } else if (opts.nvtxw) { + if (opts.output_path) { + void * nvtxwModuleHandle = nullptr; + nvtxwInterfaceCore_t *nvtxwInterface = nullptr; + nvtxwSessionHandle_t session; + nvtxwStreamHandle_t 
stream; + errorCode = initialize_nvtxw(in, opts.output_path.value().stem(), nvtxwModuleHandle, nvtxwInterface, session, stream); + if (errorCode == 0) { + convert_to_nvtxw(in, nvtxwInterface, session, stream, opts); + nvtxwResultCode_t result = nvtxwInterface->SessionEnd(session); + if (result != NVTXW3_RESULT_SUCCESS) + { + fprintf(stderr, "SessionEnd failed with code %d\n", (int)result); + return RESULT_FAILURE; + } + } + nvtxwUnload(nvtxwModuleHandle); + } } else { convert_to_nsys_rep(in, input_file, opts); }