diff --git a/lci/api/lci.h b/lci/api/lci.h
index 5f9ab289..434096a2 100644
--- a/lci/api/lci.h
+++ b/lci/api/lci.h
@@ -576,6 +576,13 @@ extern int LCI_RECV_SLOW_DOWN_USEC;
  */
 extern bool LCI_IBV_ENABLE_TD;
 
+/**
+ * @ingroup LCI_COMM
+ * @brief Which gid index to use for the ibv backend. If negative (the default
+ * is -1), no gid is queried and an all-zero gid is exchanged instead.
+ */
+extern int LCI_IBV_GID_IDX;
+
 /**
  * @ingroup LCI_COMM
  * @brief Whether to enable the progress specific network endpoint.
diff --git a/lci/backend/ibv/lcisi_ibv_detail.c b/lci/backend/ibv/lcisi_ibv_detail.c
index f31d501c..426c9380 100644
--- a/lci/backend/ibv/lcisi_ibv_detail.c
+++ b/lci/backend/ibv/lcisi_ibv_detail.c
@@ -45,10 +45,8 @@ static double translate_speed(uint8_t speed)
   }
 }
 
-bool LCISI_ibv_select_best_device_port(struct ibv_device** dev_list,
-                                       int num_devices,
-                                       struct ibv_device** device_o,
-                                       uint8_t* port_o)
+bool select_best_device_port(struct ibv_device** dev_list, int num_devices,
+                             struct ibv_device** device_o, uint8_t* port_o)
 {
   struct ibv_device* best_device;
   uint8_t best_port;
diff --git a/lci/backend/ibv/lcisi_ibv_detail.h b/lci/backend/ibv/lcisi_ibv_detail.h
index 261573bb..d69c5f77 100644
--- a/lci/backend/ibv/lcisi_ibv_detail.h
+++ b/lci/backend/ibv/lcisi_ibv_detail.h
@@ -3,9 +3,46 @@
 
 #include "infiniband/verbs.h"
 #include <stdbool.h>
-bool LCISI_ibv_select_best_device_port(struct ibv_device** dev_list,
-                                       int num_devices,
-                                       struct ibv_device** device_o,
-                                       uint8_t* port_o);
+#include <endian.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+bool select_best_device_port(struct ibv_device** dev_list, int num_devices,
+                             struct ibv_device** device_o, uint8_t* port_o);
+
+// Encode a 16-byte gid as 32 hex digits; wgid must hold at least 33 bytes
+// (32 digits + NUL). Defined static inline so that including this header
+// from several translation units does not produce duplicate symbols.
+static inline void gid_to_wire_gid(const union ibv_gid* gid, char wgid[])
+{
+  uint32_t tmp_gid[4];
+
+  memcpy(tmp_gid, gid, sizeof(tmp_gid));
+  for (int i = 0; i < 4; ++i) {
+    snprintf(&wgid[i * 8], 9, "%08x", htobe32(tmp_gid[i]));
+  }
+}
+
+// Decode the first 32 hex digits of wgid (as written by gid_to_wire_gid)
+// into gid. wgid must provide at least 32 readable bytes; an 8-digit group
+// that does not parse as hex is decoded as zero.
+static inline void wire_gid_to_gid(const char* wgid, union ibv_gid* gid)
+{
+  char tmp[9];
+  uint32_t tmp_gid[4];
+
+  tmp[8] = 0;
+  for (int i = 0; i < 4; ++i) {
+    // %x stores through an unsigned int*, so parse into one (not __be32)
+    unsigned int v32 = 0;
+    memcpy(tmp, wgid + i * 8, 8);
+    if (sscanf(tmp, "%x", &v32) != 1) {
+      v32 = 0;
+    }
+    tmp_gid[i] = be32toh(v32);
+  }
+  memcpy(gid, tmp_gid, sizeof(*gid));
+}
 
 #endif
\ No newline at end of file
diff --git a/lci/backend/ibv/server_ibv.c b/lci/backend/ibv/server_ibv.c
index 59ad892c..318ee907 100644
--- a/lci/backend/ibv/server_ibv.c
+++ b/lci/backend/ibv/server_ibv.c
@@ -1,4 +1,5 @@
 #include "runtime/lcii.h"
+#include "backend/ibv/lcisi_ibv_detail.h"
 
 static const int max_sge_num = 1;
 static const int inline_size = 236;
@@ -85,8 +86,8 @@ void LCISD_server_init(LCIS_server_t* s)
     exit(EXIT_FAILURE);
   }
 
-  bool ret = LCISI_ibv_select_best_device_port(
-      server->dev_list, num_devices, &server->ib_dev, &server->dev_port);
+  bool ret = select_best_device_port(server->dev_list, num_devices,
+                                     &server->ib_dev, &server->dev_port);
   LCI_Assert(ret, "Cannot find available ibv device/port!\n");
 
   // ibv_open_device provides the user with a verbs context which is the object
@@ -167,6 +168,20 @@ void LCISD_server_init(LCIS_server_t* s)
   LCI_Log(LCI_LOG_INFO, "ibv", "Maximum MTU: %s; Active MTU: %s\n",
           mtu_str(server->port_attr.max_mtu),
           mtu_str(server->port_attr.active_mtu));
+
+  // query the gid
+  server->gid_idx = LCI_IBV_GID_IDX;
+  if (server->gid_idx >= 0) {
+    LCI_Log(LCI_LOG_INFO, "ibv", "Use GID index: %d\n", server->gid_idx);
+    if (ibv_query_gid(server->dev_ctx, server->dev_port, server->gid_idx,
+                      &server->gid)) {
+      fprintf(stderr, "can't read sgid of index %d\n", server->gid_idx);
+      exit(EXIT_FAILURE);
+    }
+  } else
+    memset(&server->gid, 0, sizeof(server->gid));
+
+  // Initialize the event polling thread
   LCISI_event_polling_thread_init(server);
 }
 
@@ -337,12 +352,15 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
         exit(EXIT_FAILURE);
       }
     }
+    char wgid[33];
+    memset(wgid, 0, sizeof(wgid));
+    gid_to_wire_gid(&endpoint_p->server->gid, wgid);
     // Use this queue pair "i" to connect to rank e.
     char key[LCT_PMI_STRING_LIMIT + 1];
     sprintf(key, "LCI_KEY_%d_%d_%d", endpoint_id, LCI_RANK, i);
     char value[LCT_PMI_STRING_LIMIT + 1];
-    sprintf(value, "%x:%hx", endpoint_p->qps[i]->qp_num,
-            endpoint_p->server->port_attr.lid);
+    snprintf(value, sizeof(value), "%x:%hx:%s", endpoint_p->qps[i]->qp_num,
+             endpoint_p->server->port_attr.lid, wgid);
     LCT_pmi_publish(key, value);
   }
   LCI_Log(LCI_LOG_INFO, "ibv", "Current inline data size is %d\n", inline_size);
@@ -356,7 +374,14 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
     LCT_pmi_getname(i, key, value);
     uint32_t dest_qpn;
     uint16_t dest_lid;
-    sscanf(value, "%x:%hx", &dest_qpn, &dest_lid);
+    union ibv_gid gid;
+    char wgid[33] = {0};
+    // %32s keeps a malformed value from overflowing wgid
+    if (sscanf(value, "%x:%hx:%32s", &dest_qpn, &dest_lid, wgid) != 3) {
+      fprintf(stderr, "can't parse the published address %s\n", value);
+      exit(EXIT_FAILURE);
+    }
+    wire_gid_to_gid(wgid, &gid);
     // Once a queue pair (QP) has receive buffers posted to it, it is now
     // possible to transition the QP into the ready to receive (RTR) state.
     {
@@ -383,6 +408,13 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
       attr.min_rnr_timer = 12;
       // should not be necessary to set these, given is_global = 0
       memset(&attr.ah_attr.grh, 0, sizeof attr.ah_attr.grh);
+      // If the peer published a non-zero gid, address it through the GRH
+      if (gid.global.interface_id) {
+        attr.ah_attr.is_global = 1;
+        attr.ah_attr.grh.hop_limit = 1;
+        attr.ah_attr.grh.dgid = gid;
+        attr.ah_attr.grh.sgid_index = endpoint_p->server->gid_idx;
+      }
 
       int flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
                   IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC |
diff --git a/lci/backend/ibv/server_ibv.h b/lci/backend/ibv/server_ibv.h
index cb9d47e0..6c3a10ba 100644
--- a/lci/backend/ibv/server_ibv.h
+++ b/lci/backend/ibv/server_ibv.h
@@ -29,6 +29,8 @@ typedef struct __attribute__((aligned(LCI_CACHE_LINE))) LCISI_server_t {
   uint8_t dev_port;
   struct ibv_mr* odp_mr;
   size_t max_inline;
+  int gid_idx;
+  union ibv_gid gid;
   // event polling thread
   pthread_t event_polling_thread;
   atomic_bool event_polling_thread_run;
diff --git a/lci/backend/server.h b/lci/backend/server.h
index 777a91d2..3f39c637 100644
--- a/lci/backend/server.h
+++ b/lci/backend/server.h
@@ -95,7 +95,6 @@ static inline LCI_error_t LCISD_post_recv(LCIS_endpoint_t endpoint_pp,
 #endif
 #ifdef LCI_USE_SERVER_IBV
 #include "backend/ibv/server_ibv.h"
-#include "backend/ibv/lcisi_ibv_detail.h"
 #endif
 #ifdef LCI_USE_SERVER_UCX
 #include "backend/ucx/server_ucx.h"
diff --git a/lci/runtime/env.c b/lci/runtime/env.c
index df29ae5f..cb93e7fd 100644
--- a/lci/runtime/env.c
+++ b/lci/runtime/env.c
@@ -25,6 +25,7 @@ LCI_API bool LCI_IBV_ENABLE_EVENT_POLLING_THREAD;
 LCI_API int LCI_SEND_SLOW_DOWN_USEC;
 LCI_API int LCI_RECV_SLOW_DOWN_USEC;
 LCI_API bool LCI_IBV_ENABLE_TD;
+LCI_API int LCI_IBV_GID_IDX;
 LCI_API bool LCI_ENABLE_PRG_NET_ENDPOINT;
 LCI_API LCI_rdv_protocol_t LCI_RDV_PROTOCOL;
 LCI_API bool LCI_OFI_CXI_TRY_NO_HACK;
@@ -85,6 +86,7 @@ void LCII_env_init(int num_proc, int rank)
 #endif
   LCI_IBV_ENABLE_TD =
       LCIU_getenv_or("LCI_IBV_ENABLE_TD", LCI_IBV_ENABLE_TD_DEFAULT);
+  LCI_IBV_GID_IDX = LCIU_getenv_or("LCI_IBV_GID_IDX", -1);
   LCI_ENABLE_PRG_NET_ENDPOINT = LCIU_getenv_or(
       "LCI_ENABLE_PRG_NET_ENDPOINT", LCI_ENABLE_PRG_NET_ENDPOINT_DEFAULT);
   LCI_MEDIUM_SIZE = LCI_PACKET_SIZE - sizeof(struct LCII_packet_context);
diff --git a/lct/pmi/pmi_wrapper.cpp b/lct/pmi/pmi_wrapper.cpp
index 8a926381..98bfcc33 100644
--- a/lct/pmi/pmi_wrapper.cpp
+++ b/lct/pmi/pmi_wrapper.cpp
@@ -86,11 +86,15 @@ int LCT_pmi_get_rank() { return lcti_pmi_ops.get_rank(); }
 int LCT_pmi_get_size() { return lcti_pmi_ops.get_size(); }
 void LCT_pmi_publish(char* key, char* value)
 {
+  LCT_Log(LCT_log_ctx_default, LCT_LOG_DEBUG, "pmi", "publish %s %s\n", key,
+          value);
   lcti_pmi_ops.publish(key, value);
 }
 void LCT_pmi_getname(int rank, char* key, char* value)
 {
   lcti_pmi_ops.getname(rank, key, value);
+  LCT_Log(LCT_log_ctx_default, LCT_LOG_DEBUG, "pmi", "getname %d %s %s\n", rank,
+          key, value);
 }
 void LCT_pmi_barrier()
 {