Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(tracing): add nvtx provider #363

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ CHECK_GCC_BUILTIN([__builtin_ffsll])
# Checks for external packages
CHECK_PKG_LIBFABRIC([], [AC_MSG_ERROR([NCCL OFI Plugin could not find a working Libfabric install.])])

CHECK_PKG_NVTX()
CHECK_PKG_LTTNG()

have_device_interface=no
Expand Down
4 changes: 3 additions & 1 deletion include/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ noinst_HEADERS = \
nccl_ofi_topo.h \
nccl_ofi_tuner.h \
nccl_ofi_ofiutils.h \
tracepoint.h \
nccl_ofi_tracepoint.h \
tracing_impl/lttng.h \
tracing_impl/nvtx.h \
nccl-headers/net.h \
nccl-headers/error.h \
nccl-headers/nvidia/err.h \
Expand Down
74 changes: 74 additions & 0 deletions include/nccl_ofi_tracepoint.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved.
*/


#pragma once
rauteric marked this conversation as resolved.
Show resolved Hide resolved

#include "config.h"
#include "tracing_impl/nvtx.h"
#include "tracing_impl/lttng.h"

/*
 * Tracing entry points used by the plugin sources.
 *
 * Every NCCL_OFI_TRACE_<EVENT>() macro expands to a single statement that
 *   1. emits the matching lttng_ust_tracepoint() event for the
 *      nccl_ofi_plugin provider, forwarding the macro arguments, and
 *   2. calls nvtx_push() with the event name as the label.
 *
 * NCCL_OFI_TRACE_POP() only calls nvtx_pop(); any arguments are ignored.
 *
 * lttng_ust_tracepoint, nvtx_push and nvtx_pop come from the
 * tracing_impl/ headers included above.
 */

/* Send path */
#define NCCL_OFI_TRACE_SEND(dev, size, comm, msg_seq_num, request, nccl_req) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Send, dev, size, comm, msg_seq_num, request, nccl_req); \
		nvtx_push("Send"); \
	} while (0)

#define NCCL_OFI_TRACE_SEND_CTRL_RECV(dev, rail_id, comm, msg_seq_num) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Send_ctrl_recv, dev, rail_id, comm, msg_seq_num); \
		nvtx_push("Send_ctrl_recv"); \
	} while (0)

#define NCCL_OFI_TRACE_SEND_WRITE_SEG_START(dev, rail_id, size, comm, msg_seq_num, request) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Send_write_segment_start, dev, rail_id, size, comm, msg_seq_num, request); \
		nvtx_push("Send_write_segment_start"); \
	} while (0)

#define NCCL_OFI_TRACE_SEND_WRITE_SEG_COMPLETE(dev, rail_id, comm, msg_seq_num, request) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Send_write_segment_complete, dev, rail_id, comm, msg_seq_num, request); \
		nvtx_push("Send_write_segment_complete"); \
	} while (0)

/* Receive path */
#define NCCL_OFI_TRACE_RECV(dev, tag, size, request, nccl_req) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Recv, dev, tag, size, request, nccl_req); \
		nvtx_push("Recv"); \
	} while (0)

#define NCCL_OFI_TRACE_RECV_CTRL_SEND_COMPLETE(request) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Recv_ctrl_send_complete, request); \
		nvtx_push("Recv_ctrl_send_complete"); \
	} while (0)

#define NCCL_OFI_TRACE_RECV_SEGMENT_COMPLETE(dev, rail_id, size, request) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Recv_segment_complete, dev, rail_id, size, request); \
		nvtx_push("Recv_segment_complete"); \
	} while (0)

#define NCCL_OFI_TRACE_EAGER_RECV(dev, rail_id, comm, msg_seq_num) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Eager_recv, dev, rail_id, comm, msg_seq_num); \
		nvtx_push("Eager_recv"); \
	} while (0)

/* Completion processing and flush */
#define NCCL_OFI_TRACE_COMPLETIONS(request,ctx) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, ProcessCompletions, request,ctx); \
		nvtx_push("ProcessCompletions"); \
	} while (0)

#define NCCL_OFI_TRACE_FLUSH(request, nccl_req) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Flush, request, nccl_req); \
		nvtx_push("Flush"); \
	} while (0)

/* Pending-queue bookkeeping */
#define NCCL_OFI_TRACE_PENDING_INSERT(request) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Pending_queue_insert, request); \
		nvtx_push("Pending_queue_insert"); \
	} while (0)

#define NCCL_OFI_TRACE_PENDING_REMOVE(request) \
	do { \
		lttng_ust_tracepoint(nccl_ofi_plugin, Pending_queue_remove, request); \
		nvtx_push("Pending_queue_remove"); \
	} while (0)

/* Counterpart of the nvtx_push() issued by the macros above. */
#define NCCL_OFI_TRACE_POP(...) \
	do { \
		nvtx_pop(); \
	} while (0)
71 changes: 28 additions & 43 deletions include/tracepoint.h → include/tracing_impl/lttng.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved.
*/

#pragma once

/*
 * Guard on the macro that configure actually emits (HAVE_LIBLTTNG_UST).
 * HAVE_LTTNG_UST is never defined, so testing it would always select the
 * fallback branch and compile the LTTng tracepoints out.
 */
#if HAVE_LIBLTTNG_UST

#undef LTTNG_UST_TRACEPOINT_PROVIDER
#define LTTNG_UST_TRACEPOINT_PROVIDER nccl_ofi_plugin

#undef LTTNG_UST_TRACEPOINT_INCLUDE
#define LTTNG_UST_TRACEPOINT_INCLUDE "include/tracepoint.h"
#define LTTNG_UST_TRACEPOINT_INCLUDE "include/tracing_impl/lttng.h"

/*
* To add a tracepoint at the nccl_ofi_plugin layer:
Expand All @@ -28,11 +32,10 @@
* tracing output, and arguments <arg1> and <arg2> with <name1> and
* <name2> will appear in that trace as data.
*
* Add a wrapper macro for the new tracepoint to the top-level nccl_ofi_tracepoint.h.
rauteric marked this conversation as resolved.
Show resolved Hide resolved
*
*/

#include "config.h"
#if HAVE_LIBLTTNG_UST == 1

/*
* LTTNG_UST_TRACEPOINT_HEADER_MULTI_READ must be included so that the tracepoints
* can be defined and compiled from tracepoint.c, and so they can be referenced
Expand Down Expand Up @@ -65,8 +68,8 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer_hex(uint64_t, nccl_req, (uint64_t)nccl_req)
)
)
#define NCCL_OFI_TRACE_SEND(dev, size, comm, msg_seq_num, request, nccl_req) \
lttng_ust_tracepoint(nccl_ofi_plugin, Send, dev, size, comm, msg_seq_num, request, nccl_req)



LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -84,8 +87,9 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer(uint16_t, msg_seq_num, msg_seq_num)
)
)
#define NCCL_OFI_TRACE_SEND_CTRL_RECV(dev, rail_id, comm, msg_seq_num) \
lttng_ust_tracepoint(nccl_ofi_plugin, Send_ctrl_recv, dev, rail_id, comm, msg_seq_num)




LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -107,8 +111,8 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
)
)
#define NCCL_OFI_TRACE_SEND_WRITE_SEG_START(dev, rail_id, size, comm, msg_seq_num, request) \
lttng_ust_tracepoint(nccl_ofi_plugin, Send_write_segment_start, dev, rail_id, size, comm, msg_seq_num, request)



LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -128,8 +132,8 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
)
)
#define NCCL_OFI_TRACE_SEND_WRITE_SEG_COMPLETE(dev, rail_id, comm, msg_seq_num, request) \
lttng_ust_tracepoint(nccl_ofi_plugin, Send_write_segment_complete, dev, rail_id, comm, msg_seq_num, request)



LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -149,8 +153,8 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer_hex(uint64_t, nccl_req, (uint64_t)nccl_req)
)
)
#define NCCL_OFI_TRACE_RECV(dev, tag, size, request, nccl_req) \
lttng_ust_tracepoint(nccl_ofi_plugin, Recv, dev, tag, size, request, nccl_req)



LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -162,8 +166,8 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
)
)
#define NCCL_OFI_TRACE_RECV_CTRL_SEND_COMPLETE(request) \
lttng_ust_tracepoint(nccl_ofi_plugin, Recv_ctrl_send_complete, request)



LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -181,8 +185,7 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
)
)
#define NCCL_OFI_TRACE_RECV_SEGMENT_COMPLETE(dev, rail_id, size, request) \
lttng_ust_tracepoint(nccl_ofi_plugin, Recv_segment_complete, dev, rail_id, size, request)


LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -200,8 +203,7 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer(uint16_t, msg_seq_num, msg_seq_num)
)
)
#define NCCL_OFI_TRACE_EAGER_RECV(dev, rail_id, comm, msg_seq_num) \
lttng_ust_tracepoint(nccl_ofi_plugin, Eager_recv, dev, rail_id, comm, msg_seq_num)


LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -215,8 +217,8 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer(uint64_t, ctx, (uint64_t)ctx)
)
)
#define NCCL_OFI_TRACE_COMPLETIONS(request,ctx) \
lttng_ust_tracepoint(nccl_ofi_plugin, ProcessCompletions, request,ctx)



LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -230,8 +232,7 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer_hex(uint64_t, nccl_req, (uint64_t)nccl_req)
)
)
#define NCCL_OFI_TRACE_FLUSH(request, nccl_req) \
lttng_ust_tracepoint(nccl_ofi_plugin, Flush, request, nccl_req)


LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -243,8 +244,7 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
)
)
#define NCCL_OFI_TRACE_PENDING_INSERT(request) \
lttng_ust_tracepoint(nccl_ofi_plugin, Pending_queue_insert, request)


LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand All @@ -256,26 +256,11 @@ LTTNG_UST_TRACEPOINT_EVENT(
lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
)
)
#define NCCL_OFI_TRACE_PENDING_REMOVE(request) \
lttng_ust_tracepoint(nccl_ofi_plugin, Pending_queue_remove, request)

#endif /* NCCL_OFI_TRACEPOINT_H */

#include <lttng/tracepoint-event.h>

#else

#define NCCL_OFI_TRACE_SEND(...)
#define NCCL_OFI_TRACE_SEND_CTRL_RECV(...)
#define NCCL_OFI_TRACE_SEND_WRITE_SEG_START(...)
#define NCCL_OFI_TRACE_SEND_WRITE_SEG_COMPLETE(...)
#define NCCL_OFI_TRACE_RECV(...)
#define NCCL_OFI_TRACE_RECV_CTRL_SEND_COMPLETE(...)
#define NCCL_OFI_TRACE_RECV_SEGMENT_COMPLETE(...)
#define NCCL_OFI_TRACE_EAGER_RECV(...)
#define NCCL_OFI_TRACE_FLUSH(...)
#define NCCL_OFI_TRACE_PENDING_INSERT(...)
#define NCCL_OFI_TRACE_PENDING_REMOVE(...)
#define NCCL_OFI_TRACE_COMPLETIONS(...)

#endif // HAVE_LIBLTTNG_UST
#define lttng_ust_tracepoint(...)
#endif
25 changes: 25 additions & 0 deletions include/tracing_impl/nvtx.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2022-2024 Amazon.com, Inc. or its affiliates. All rights reserved.
*/

#pragma once
#if HAVE_NVTX_TRACING
#include "nvToolsExt.h"
static inline void nvtx_push(const char* name) {
const nvtxEventAttributes_t eventAttrib = {
.version = NVTX_VERSION,
.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE,
.colorType = NVTX_COLOR_ARGB,
.color = 0xeb9234,
.messageType = NVTX_MESSAGE_TYPE_ASCII,
.message = { .ascii = name },
};
nvtxRangePushEx(&eventAttrib);
}
/*
 * Counterpart of nvtx_push(): calls nvtxRangePop() and discards its
 * return value.
 */
static inline void nvtx_pop(void)
{
	(void)nvtxRangePop();
}
#else
/*
 * NVTX support is compiled out: keep the same entry points as empty
 * functions so callers build unchanged.
 */
static inline void nvtx_push(const char *name)
{
	/* Reference the parameter so it does not trigger an unused warning. */
	(void)name;
}

static inline void nvtx_pop(void)
{
}
rauteric marked this conversation as resolved.
Show resolved Hide resolved
#endif
2 changes: 1 addition & 1 deletion m4/check_pkg_lttng.m4
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ AC_DEFUN([CHECK_PKG_LTTNG], [
LIBS="${check_pkg_LIBS_save}"
$2])

AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available])
dnl Substitute check_pkg_define, as the line this replaces did, rather than
dnl check_pkg_found. NOTE(review): assumes this macro sets check_pkg_define
dnl to 1 or 0 and check_pkg_found to yes or no, as CHECK_PKG_NVTX does;
dnl confirm against the full file.
AC_DEFINE_UNQUOTED([HAVE_LIBLTTNG_UST], [${check_pkg_define}], [Defined to 1 if lttng-ust is requested and available])

AS_UNSET([check_pkg_found])
AS_UNSET([check_pkg_define])
Expand Down
46 changes: 46 additions & 0 deletions m4/check_pkg_nvtx.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- autoconf -*-
#
# Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# CHECK_PKG_NVTX([action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Probe for the NVTX header (nvToolsExt.h) and library (libnvToolsExt).
#
#   --with-nvtx / --with-nvtx=yes / option omitted : probe default paths
#   --with-nvtx=no                                 : skip the probe
#   --with-nvtx=DIR                                : probe DIR/include and
#                                                    DIR/lib64 or DIR/lib
#
# Defines HAVE_NVTX_TRACING to 1 when both header and library are usable
# and to 0 otherwise. On failure CPPFLAGS, LDFLAGS and LIBS are restored
# to the values they had on entry.
AC_DEFUN([CHECK_PKG_NVTX], [
  check_pkg_found=yes

  check_pkg_CPPFLAGS_save="${CPPFLAGS}"
  check_pkg_LDFLAGS_save="${LDFLAGS}"
  check_pkg_LIBS_save="${LIBS}"

  dnl NOTE(review): the help text advertises default=no, but an omitted
  dnl option is handled like --with-nvtx=yes below. Confirm which default
  dnl is intended before changing either side.
  AC_ARG_WITH([nvtx],
     [AS_HELP_STRING([--with-nvtx=DIR], [Enable tracing capability with NVTX @<:@default=no@:>@])])

  dnl Two test invocations joined with a shell OR are used here because the
  dnl -o operator of test is not portable.
  AS_IF([test -z "${with_nvtx}" || test "${with_nvtx}" = "yes"],
        [],
        [test "${with_nvtx}" = "no"],
        [check_pkg_found=no],
        [AS_IF([test -d "${with_nvtx}/lib64"], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
         CPPFLAGS="-I${with_nvtx}/include ${CPPFLAGS}"
         LDFLAGS="-L${with_nvtx}/${check_pkg_libdir} ${LDFLAGS}"])

  dnl The tracing header includes nvToolsExt.h, so finding the library alone
  dnl is not enough to guarantee a successful build.
  AS_IF([test "${check_pkg_found}" = "yes"],
        [AC_CHECK_HEADER([nvToolsExt.h], [], [check_pkg_found=no])])

  AS_IF([test "${check_pkg_found}" = "yes"],
        [AC_CHECK_LIB([nvToolsExt], [nvtxRangePop], [], [check_pkg_found=no])])

  AS_IF([test "${check_pkg_found}" = "yes"],
        [check_pkg_define=1
         $1],
        [check_pkg_define=0
         CPPFLAGS="${check_pkg_CPPFLAGS_save}"
         LDFLAGS="${check_pkg_LDFLAGS_save}"
         LIBS="${check_pkg_LIBS_save}"
         $2])

  AC_DEFINE_UNQUOTED([HAVE_NVTX_TRACING], [${check_pkg_define}], [Defined to 1 if NVTX is available])

  AS_UNSET([check_pkg_found])
  AS_UNSET([check_pkg_define])
  AS_UNSET([check_pkg_libdir])
  AS_UNSET([check_pkg_CPPFLAGS_save])
  AS_UNSET([check_pkg_LDFLAGS_save])
  AS_UNSET([check_pkg_LIBS_save])
])
2 changes: 1 addition & 1 deletion src/nccl_ofi_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#include "nccl_ofi.h"
#include "nccl_ofi_param.h"
#include "tracepoint.h"
#include "nccl_ofi_tracepoint.h"
#if HAVE_CUDA
#include "nccl_ofi_cuda.h"
#endif
Expand Down
2 changes: 1 addition & 1 deletion src/nccl_ofi_ofiutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#include "nccl_ofi.h"
#include "nccl_ofi_param.h"
#include "tracepoint.h"
#include "nccl_ofi_tracepoint.h"
#if HAVE_CUDA
#include "nccl_ofi_cuda.h"
#endif
Expand Down
Loading
Loading