From 41ad9f7fdd484572b4d5946ba44312befe7fe19b Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Fri, 3 Jan 2025 22:03:28 +0000 Subject: [PATCH] btl/uct: reduce number of messages sent when establishing connections The btl/uct code can be quite aggressive at sending connection messages over the connection endpoint. This could lead to a large number of unnecessary messages in some cases. This commit adds code to restrict retries to at most one attempt every 2ms by default. This interval is controlled by a new MCA variable: btl_uct_connection_retry_timeout. Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct.h | 7 ++- opal/mca/btl/uct/btl_uct_component.c | 13 ++++- opal/mca/btl/uct/btl_uct_endpoint.c | 82 +++++++++++++++++----------- opal/mca/btl/uct/btl_uct_types.h | 6 ++ 4 files changed, 74 insertions(+), 34 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 85f9d9b33a8..65bc69fddb2 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2019 Google, LLC. All rights reserved. + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. * Copyright (c) 2019 Intel, Inc. All rights reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. 
@@ -40,6 +40,8 @@ #include "opal/mca/mpool/mpool.h" #include "opal/mca/pmix/pmix-internal.h" #include "opal/mca/rcache/base/base.h" +#include "opal/mca/threads/condition.h" +#include "opal/mca/threads/mutex.h" #include "opal/mca/threads/tsd.h" #include "opal/util/event.h" #include @@ -153,6 +155,9 @@ struct mca_btl_uct_component_t { /** disable UCX memory hooks */ bool disable_ucx_memory_hooks; + + /** connection retry timeout */ + unsigned int connection_retry_timeout; }; typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 65060e17819..72546ba46b4 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -17,7 +17,7 @@ * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2019-2024 Google, LLC. All rights reserved. + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. * Copyright (c) 2019 Intel, Inc. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -102,6 +102,17 @@ static int mca_btl_uct_component_register(void) MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.bind_threads_to_contexts); #endif + /* timeout between connection message attempts in µs */ + mca_btl_uct_component.connection_retry_timeout = 2000; + (void) mca_base_component_var_register( + &mca_btl_uct_component.super.btl_version, "connection_retry_timeout", + "Timeout between attempts to send connection messages for connect-to-" + "endpoint connections. The timeout is measured in µs and is only" + "necessary when using unreliable transports for connections (ex: UD). 
" + "(default: 2000µs)", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_retry_timeout); + /* for now we want this component to lose to btl/ugni and btl/vader */ module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH; diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 04367ccf2f4..aa255a0f7f1 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -4,7 +4,7 @@ * reserved. * Copyright (c) 2018 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2019 Google, LLC. All rights reserved. + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -16,6 +16,7 @@ #include "btl_uct.h" #include "btl_uct_am.h" #include "btl_uct_device_context.h" +#include "opal/mca/timer/base/base.h" #include "opal/util/proc.h" static void mca_btl_uct_endpoint_construct(mca_btl_uct_endpoint_t *endpoint) @@ -257,21 +258,17 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, return OPAL_SUCCESS; } -static int mca_btl_uct_endpoint_connect_endpoint( +static int mca_btl_uct_endpoint_send_connection_data( mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, - uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) + uint8_t *conn_tl_data, int request_type) { - size_t request_length = sizeof(mca_btl_uct_conn_req_t) - + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; - mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; - mca_btl_uct_conn_req_t *request = alloca(request_length); + mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; uct_device_addr_t 
*device_addr = NULL; uct_iface_addr_t *iface_addr; ucs_status_t ucs_status; - int rc; assert(NULL != conn_tl); @@ -302,15 +299,50 @@ static int mca_btl_uct_endpoint_connect_endpoint( ucs_status)); return OPAL_ERROR; } - } else { - OBJ_RETAIN(conn_ep); } + size_t request_length = sizeof(mca_btl_uct_conn_req_t) + + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; + mca_btl_uct_conn_req_t *request = alloca(request_length); + /* fill in common request parameters */ request->proc_name = OPAL_PROC_MY_NAME; request->context_id = tl_context->context_id; request->tl_index = tl->tl_index; - request->type = !!(ep_addr); + request->type = request_type; + + /* fill in connection request */ + ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); + if (UCS_OK != ucs_status) { + /* this is a fatal a fatal error */ + OBJ_RELEASE(endpoint->conn_ep); + uct_ep_destroy(tl_endpoint->uct_ep); + tl_endpoint->uct_ep = NULL; + return OPAL_ERROR; + } + + /* let the remote side know that the connection has been established and + * wait for the message to be sent */ + int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request, + request_length); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + OBJ_RELEASE(endpoint->conn_ep); + uct_ep_destroy(tl_endpoint->uct_ep); + tl_endpoint->uct_ep = NULL; + return OPAL_ERROR; + } + + tl_endpoint->last_connection_req = opal_timer_base_get_usec(); + + return OPAL_SUCCESS; +} + +static int mca_btl_uct_endpoint_connect_endpoint( + mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, + mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, + uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) +{ + ucs_status_t ucs_status; if (NULL == tl_endpoint->uct_ep) { BTL_VERBOSE(("allocating endpoint for peer %s and sending connection data", @@ -338,29 +370,15 @@ static int mca_btl_uct_endpoint_connect_endpoint( } } - /* fill in 
connection request */ - ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); - if (UCS_OK != ucs_status) { - /* this is a fatal a fatal error */ - OBJ_RELEASE(endpoint->conn_ep); - uct_ep_destroy(tl_endpoint->uct_ep); - tl_endpoint->uct_ep = NULL; - return OPAL_ERROR; - } - - /* let the remote side know that the connection has been established and - * wait for the message to be sent */ - rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request, - request_length); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - OBJ_RELEASE(endpoint->conn_ep); - uct_ep_destroy(tl_endpoint->uct_ep); - tl_endpoint->uct_ep = NULL; - return OPAL_ERROR; + opal_timer_t now = opal_timer_base_get_usec(); + if ((now - tl_endpoint->last_connection_req) < mca_btl_uct_component.connection_retry_timeout && !ep_addr) { + return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS + : OPAL_ERR_OUT_OF_RESOURCE; } - return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS - : OPAL_ERR_OUT_OF_RESOURCE; + int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint, + conn_tl_data, /*request_type=*/!!ep_addr); + return (OPAL_SUCCESS == rc) ? OPAL_ERR_OUT_OF_RESOURCE : rc; } int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 156451fa307..cd331986b8a 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -2,6 +2,7 @@ /* * Copyright (c) 2018 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2025 Google, LLC. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -14,6 +15,8 @@ # include "opal/mca/btl/btl.h" +#include "opal/mca/timer/base/base.h" + /* forward declarations */ struct mca_btl_uct_module_t; struct mca_btl_base_endpoint_t; @@ -100,6 +103,9 @@ struct mca_btl_uct_tl_endpoint_t { /** UCT endpoint handle */ uct_ep_h uct_ep; + + /** Time of last connection message. */ + opal_timer_t last_connection_req; }; typedef struct mca_btl_uct_tl_endpoint_t mca_btl_uct_tl_endpoint_t;