Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

more performance counters #79

Merged
merged 1 commit into from
Dec 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lci/backend/ibv/server_ibv.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,9 @@ static inline int LCISD_poll_cq(LCIS_endpoint_t endpoint_pp,
#ifdef LCI_ENABLE_MULTITHREAD_PROGRESS
if (!LCIU_try_acquire_spinlock(&endpoint_p->cq_lock)) return 0;
#endif
LCII_PCOUNTER_START(net_poll_cq_succeed_timer);
int ne = ibv_poll_cq(endpoint_p->cq, LCI_CQ_MAX_POLL, wc);
LCII_PCOUNTER_END(net_poll_cq_succeed_timer);
LCI_DBG_Assert(ne >= 0, "ibv_poll_cq returns error %d\n", ne);
#ifdef LCI_ENABLE_MULTITHREAD_PROGRESS
LCIU_release_spinlock(&endpoint_p->cq_lock);
Expand Down
44 changes: 44 additions & 0 deletions lci/backend/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,9 @@ static inline int LCIS_poll_cq(LCIS_endpoint_t endpoint_pp,
{
LCII_PCOUNTER_ADD(net_poll_cq_attempts, 1);
LCISI_CS_ENTER(endpoint_pp, 0);
LCII_PCOUNTER_NOW(start);
int ret = LCISD_poll_cq(endpoint_pp, entry);
LCII_PCOUNTER_SINCE(net_poll_cq_timer, start);
LCISI_CS_EXIT(endpoint_pp);
return ret;
}
Expand All @@ -182,8 +184,14 @@ static inline LCI_error_t LCIS_post_sends(LCIS_endpoint_t endpoint_pp, int rank,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret = LCISD_post_sends(endpoint_pp, rank, buf, size, meta);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_sends_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -206,9 +214,15 @@ static inline LCI_error_t LCIS_post_send(LCIS_endpoint_t endpoint_pp, int rank,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret =
LCISD_post_send(endpoint_pp, rank, buf, size, mr, meta, ctx);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -232,9 +246,15 @@ static inline LCI_error_t LCIS_post_puts(LCIS_endpoint_t endpoint_pp, int rank,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret =
LCISD_post_puts(endpoint_pp, rank, buf, size, base, offset, rkey);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_sends_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -259,9 +279,15 @@ static inline LCI_error_t LCIS_post_put(LCIS_endpoint_t endpoint_pp, int rank,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret =
LCISD_post_put(endpoint_pp, rank, buf, size, mr, base, offset, rkey, ctx);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -287,9 +313,15 @@ static inline LCI_error_t LCIS_post_putImms(LCIS_endpoint_t endpoint_pp,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret = LCISD_post_putImms(endpoint_pp, rank, buf, size, base,
offset, rkey, meta);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -316,9 +348,15 @@ static inline LCI_error_t LCIS_post_putImm(LCIS_endpoint_t endpoint_pp,
#ifdef LCI_ENABLE_SLOWDOWN
LCIU_spin_for_nsec(LCI_SEND_SLOW_DOWN_USEC * 1000);
#endif

LCII_PCOUNTER_NOW(start);
LCI_error_t ret = LCISD_post_putImm(endpoint_pp, rank, buf, size, mr, base,
offset, rkey, meta, ctx);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_send_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_send_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
Expand All @@ -337,8 +375,14 @@ static inline LCI_error_t LCIS_post_recv(LCIS_endpoint_t endpoint_pp, void* buf,
LCI_DBG_Log(LCI_LOG_TRACE, "server",
"LCIS_post_recv: buf %p size %u mr %p user_context %p\n", buf,
size, mr.mr_p, ctx);

LCII_PCOUNTER_NOW(start);
LCI_error_t ret = LCISD_post_recv(endpoint_pp, buf, size, mr, ctx);
LCII_PCOUNTER_NOW(end);
LCII_PCOUNTER_ADD(net_post_recv_timer, end - start);

if (ret == LCI_OK) {
LCII_PCOUNTER_ADD(net_post_recv_succeed_timer, end - start);
LCII_PCOUNTER_ADD(net_recv_posted, 1);
} else if (ret == LCI_ERR_RETRY_LOCK) {
LCII_PCOUNTER_ADD(net_recv_failed_lock, 1);
Expand Down
12 changes: 12 additions & 0 deletions lci/profile/performance_counter.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ extern LCT_pcounter_ctx_t LCII_pcounter_ctx;
_macro(unexpected_msg)

#define LCII_PCOUNTER_TIMER_FOR_EACH(_macro) \
_macro(net_post_send_timer) \
_macro(net_post_send_succeed_timer) \
_macro(net_post_recv_timer) \
_macro(net_post_recv_succeed_timer) \
_macro(net_poll_cq_timer) \
_macro(net_poll_cq_succeed_timer) \
_macro(sync_stay_timer) \
_macro(cq_stay_timer) \
_macro(useful_progress_timer) \
Expand All @@ -55,7 +61,9 @@ extern LCT_pcounter_ctx_t LCII_pcounter_ctx;
_macro(cq_push_timer) \
_macro(cq_pop_timer) \
_macro(serve_rts_timer) \
_macro(handle_rts_timer) \
_macro(rts_mem_timer) \
_macro(rts_archive_timer) \
_macro(rts_send_timer) \
_macro(serve_rtr_timer) \
_macro(rtr_mem_reg_timer) \
Expand Down Expand Up @@ -87,13 +95,17 @@ LCII_PCOUNTER_TIMER_FOR_EACH(LCII_PCOUNTER_HANDLE_DECL)
LCT_pcounter_startt(LCII_pcounter_ctx, LCII_pcounter_handle_##name, time);
#define LCII_PCOUNTER_ENDT(name, time) \
LCT_pcounter_endt(LCII_pcounter_ctx, LCII_pcounter_handle_##name, time);
#define LCII_PCOUNTER_SINCE(name, time) \
LCT_pcounter_add(LCII_pcounter_ctx, LCII_pcounter_handle_##name, \
LCT_now() - time);
#else
#define LCII_PCOUNTER_ADD(name, val)
#define LCII_PCOUNTER_START(name)
#define LCII_PCOUNTER_END(name)
#define LCII_PCOUNTER_NOW(time)
#define LCII_PCOUNTER_STARTT(name, time)
#define LCII_PCOUNTER_ENDT(name, time)
#define LCII_PCOUNTER_SINCE(name, time)
#endif

void LCII_pcounters_init();
Expand Down
4 changes: 4 additions & 0 deletions lci/runtime/rendezvous.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ static inline void LCII_handle_rts(LCI_endpoint_t ep, LCII_packet_t* packet,
int src_rank, uint16_t tag,
LCII_context_t* rdv_ctx, bool is_in_progress)
{
LCII_PCOUNTER_START(handle_rts_timer);
// Extract information from the received RTS packet
LCII_rdv_type_t rdv_type = packet->data.rts.rdv_type;
LCI_DBG_Log(LCI_LOG_TRACE, "rdv", "handle rts: rdv_type %d\n", rdv_type);
Expand Down Expand Up @@ -270,8 +271,10 @@ static inline void LCII_handle_rts(LCI_endpoint_t ep, LCII_packet_t* packet,
// We cannot use writeimm for more than 1 rdma messages.
// IOVEC does not support writeimm for now
uint64_t ctx_key;
LCII_PCOUNTER_START(rts_archive_timer);
int result =
LCM_archive_put(ep->ctx_archive_p, (uintptr_t)rdv_ctx, &ctx_key);
LCII_PCOUNTER_END(rts_archive_timer);
// TODO: be able to pass back pressure to user
LCI_Assert(result == LCM_SUCCESS, "Archive is full!\n");
packet->data.rtr.recv_ctx_key = ctx_key;
Expand Down Expand Up @@ -312,6 +315,7 @@ static inline void LCII_handle_rts(LCI_endpoint_t ep, LCII_packet_t* packet,
ep->device->heap_segment->mr,
LCII_MAKE_PROTO(ep->gid, LCI_MSG_RTR, 0), rtr_ctx);
LCII_PCOUNTER_END(rts_send_timer);
LCII_PCOUNTER_END(handle_rts_timer);
}

static inline void LCII_handle_rtr(LCI_endpoint_t ep, LCII_packet_t* packet)
Expand Down
10 changes: 5 additions & 5 deletions lct/pcounter/pcounter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,19 +109,19 @@ struct tls_ctx_t {
switch (handle.type) {
case LCT_PCOUNTER_COUNTER:
if (handle.idx >= counters.size()) {
counters.resize(handle.idx + 1);
counters.resize(handle.idx * 2 + 1);
}
counters[handle.idx].add(val);
break;
case LCT_PCOUNTER_TREND:
if (handle.idx >= trends.size()) {
trends.resize(handle.idx + 1);
trends.resize(handle.idx * 2 + 1);
}
trends[handle.idx].add(val);
break;
case LCT_PCOUNTER_TIMER:
if (handle.idx >= timers.size()) {
timers.resize(handle.idx + 1);
timers.resize(handle.idx * 2 + 1);
}
timers[handle.idx].add(val);
break;
Expand All @@ -141,7 +141,7 @@ struct tls_ctx_t {
", type: " + std::to_string(handle.type) + ", " +
std::to_string(handle.idx));
if (handle.idx >= timers.size()) {
timers.resize(handle.idx + 1);
timers.resize(handle.idx * 2 + 1);
}
timers[handle.idx].start(time);
lock.unlock();
Expand All @@ -155,7 +155,7 @@ struct tls_ctx_t {
std::to_string(handle.type) + " " +
std::to_string(handle.idx));
if (handle.idx >= timers.size()) {
timers.resize(handle.idx + 1);
timers.resize(handle.idx * 2 + 1);
}
timers[handle.idx].end(time);
lock.unlock();
Expand Down
Loading