From 59e43b9f0e48675d0ee52845bd0c6b676f4cf7c7 Mon Sep 17 00:00:00 2001
From: Jiakun Yan
Date: Wed, 18 Sep 2024 18:28:12 -0500
Subject: [PATCH] add ib roce gid auto selection

---
 .gitignore                         |   1 +
 lci/backend/ibv/lcisi_ibv_detail.c | 133 ++++++++++++++++++++++++++++-
 lci/backend/ibv/lcisi_ibv_detail.h |  24 +-----
 lci/backend/ibv/server_ibv.c       |   7 +-
 lct/CMakeLists.txt                 |   3 +-
 lct/api/lct.h                      |   4 +
 lct/util/io.cpp                    |  51 +++++++++++
 7 files changed, 199 insertions(+), 24 deletions(-)
 create mode 100644 lct/util/io.cpp

diff --git a/.gitignore b/.gitignore
index e6729457..cb24665d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@
 
 # Vscode related
 .vscode/
+build/
 
 # CLion related
 .idea
diff --git a/lci/backend/ibv/lcisi_ibv_detail.c b/lci/backend/ibv/lcisi_ibv_detail.c
index 426c9380..9c3fd31f 100644
--- a/lci/backend/ibv/lcisi_ibv_detail.c
+++ b/lci/backend/ibv/lcisi_ibv_detail.c
@@ -90,7 +90,7 @@ bool select_best_device_port(struct ibv_device** dev_list, int num_devices,
         continue;
       }
       // Check whether we can get its lid
-      if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET && !port_attr.lid) {
+      if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND && !port_attr.lid) {
         fprintf(stderr, "Couldn't get local LID\n");
         continue;
       }
@@ -134,4 +134,135 @@ bool select_best_device_port(struct ibv_device** dev_list, int num_devices,
     LCI_Log(LCI_LOG_INFO, "ibv", "No device is available!\n");
     return false;
   }
+}
+
+typedef enum roce_version_t {
+  ROCE_V1,
+  ROCE_V2,
+  ROCE_VER_UNKNOWN
+} roce_version_t;
+
+// Classify GID table entry gid_index of the server's port by reading its
+// type from sysfs. Returns ROCE_VER_UNKNOWN if the entry cannot be queried
+// or its type string is not recognized.
+roce_version_t query_gid_roce_version(LCISI_server_t* server,
+                                      unsigned gid_index)
+{
+  // Start empty: buf is logged below even when nothing was read into it.
+  char buf[16] = "";
+  int ret;
+  const char* dev_name = ibv_get_device_name(server->ib_dev);
+
+  union ibv_gid gid;
+  ret = ibv_query_gid(server->dev_ctx, server->dev_port, gid_index, &gid);
+  if (ret == 0) {
+    ret = LCT_read_file(buf, sizeof(buf),
+                        "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
+                        dev_name, server->dev_port, gid_index);
+    if (ret > 0) {
+      if (!strncmp(buf, "IB/RoCE v1", 10)) {
+        return ROCE_V1;
+      } else if (!strncmp(buf, "RoCE v2", 7)) {
+        return ROCE_V2;
+      }
+    }
+  }
+  LCI_Log(LCI_LOG_DEBUG, "ibv",
+          "failed to parse gid type '%s' (dev=%s port=%d index=%d)", buf,
+          dev_name, server->dev_port, gid_index);
+  return ROCE_VER_UNKNOWN;
+}
+
+// Check whether GID table entry gid_index is usable by creating (and
+// immediately destroying) an address handle that refers to it.
+bool test_roce_gid_index(LCISI_server_t* server, uint8_t gid_index)
+{
+  struct ibv_ah_attr ah_attr;
+  struct ibv_ah* ah;
+  union ibv_gid gid;
+
+  IBV_SAFECALL(
+      ibv_query_gid(server->dev_ctx, server->dev_port, gid_index, &gid));
+
+  memset(&ah_attr, 0, sizeof(ah_attr));
+  ah_attr.port_num = server->dev_port;
+  ah_attr.is_global = 1;
+  ah_attr.grh.dgid = gid;
+  ah_attr.grh.sgid_index = gid_index;
+  ah_attr.grh.hop_limit = 255;
+  ah_attr.grh.flow_label = 1;
+  ah_attr.dlid = 0xC000;
+
+  ah = ibv_create_ah(server->dev_pd, &ah_attr);
+  if (ah == NULL) {
+    LCI_Log(LCI_LOG_DEBUG, "ibv", "gid entry %d is not operational", gid_index);
+    return false;
+  }
+
+  ibv_destroy_ah(ah);
+  return true;
+}
+
+// Pick the GID index for a RoCE port: the first operational entry of the
+// most preferred RoCE version (v2, then v1, then unknown). Falls back to
+// index 0 when no entry is operational.
+int select_best_gid_for_roce(LCISI_server_t* server)
+{
+  static const roce_version_t roce_prio[] = {
+      ROCE_V2,
+      ROCE_V1,
+      ROCE_VER_UNKNOWN,
+  };
+  // Element count, not sizeof(roce_prio): the byte size would let the loop
+  // below index past the end of the array.
+  const int num_prio = (int)(sizeof(roce_prio) / sizeof(roce_prio[0]));
+  int i, prio_idx;
+  int gid_tbl_len = server->port_attr.gid_tbl_len;
+
+  LCI_Log(LCI_LOG_DEBUG, "ibv", "RoCE gid auto selection among %d gids\n",
+          gid_tbl_len);
+  for (prio_idx = 0; prio_idx < num_prio; prio_idx++) {
+    for (i = 0; i < gid_tbl_len; i++) {
+      roce_version_t version = query_gid_roce_version(server, i);
+
+      if ((roce_prio[prio_idx] == version) && test_roce_gid_index(server, i)) {
+        LCI_Log(LCI_LOG_INFO, "ibv", "RoCE gid auto selection: use %d %d\n", i,
+                version);
+        return i;
+      }
+    }
+  }
+
+  const int default_gid = 0;
+  LCI_Log(LCI_LOG_INFO, "ibv",
+          "RoCE gid auto selection: fall back to the default gid %d\n",
+          default_gid);
+  return default_gid;  // default gid for roce
+}
+
+// Encode gid as 32 lowercase hex digits. wgid must have room for 33 bytes:
+// the last sprintf also writes the terminating NUL.
+void gid_to_wire_gid(const union ibv_gid* gid, char wgid[])
+{
+  uint32_t tmp_gid[4];
+  int i;
+
+  memcpy(tmp_gid, gid, sizeof(tmp_gid));
+  for (i = 0; i < 4; ++i) sprintf(&wgid[i * 8], "%08x", htobe32(tmp_gid[i]));
+}
+
+// Decode the 32 hex digits at wgid (as written by gid_to_wire_gid) into gid.
+void wire_gid_to_gid(const char* wgid, union ibv_gid* gid)
+{
+  char tmp[9];
+  __be32 v32 = 0;  // stays defined even if sscanf matches nothing
+  int i;
+  uint32_t tmp_gid[4];
+
+  for (tmp[8] = 0, i = 0; i < 4; ++i) {
+    memcpy(tmp, wgid + i * 8, 8);
+    sscanf(tmp, "%x", &v32);
+    tmp_gid[i] = be32toh(v32);
+  }
+  memcpy(gid, tmp_gid, sizeof(*gid));
 }
\ No newline at end of file
diff --git a/lci/backend/ibv/lcisi_ibv_detail.h b/lci/backend/ibv/lcisi_ibv_detail.h
index d69c5f77..890cd6cb 100644
--- a/lci/backend/ibv/lcisi_ibv_detail.h
+++ b/lci/backend/ibv/lcisi_ibv_detail.h
@@ -6,28 +6,10 @@
 bool select_best_device_port(struct ibv_device** dev_list, int num_devices,
                              struct ibv_device** device_o, uint8_t* port_o);
 
-void gid_to_wire_gid(const union ibv_gid* gid, char wgid[])
-{
-  uint32_t tmp_gid[4];
-  int i;
+int select_best_gid_for_roce(LCISI_server_t* server);
 
-  memcpy(tmp_gid, gid, sizeof(tmp_gid));
-  for (i = 0; i < 4; ++i) sprintf(&wgid[i * 8], "%08x", htobe32(tmp_gid[i]));
-}
+void gid_to_wire_gid(const union ibv_gid* gid, char wgid[]);
 
-void wire_gid_to_gid(const char* wgid, union ibv_gid* gid)
-{
-  char tmp[9];
-  __be32 v32;
-  int i;
-  uint32_t tmp_gid[4];
-
-  for (tmp[8] = 0, i = 0; i < 4; ++i) {
-    memcpy(tmp, wgid + i * 8, 8);
-    sscanf(tmp, "%x", &v32);
-    tmp_gid[i] = be32toh(v32);
-  }
-  memcpy(gid, tmp_gid, sizeof(*gid));
-}
+void wire_gid_to_gid(const char* wgid, union ibv_gid* gid);
 
 #endif
\ No newline at end of file
diff --git a/lci/backend/ibv/server_ibv.c b/lci/backend/ibv/server_ibv.c
index 318ee907..5f204c78 100644
--- a/lci/backend/ibv/server_ibv.c
+++ b/lci/backend/ibv/server_ibv.c
@@ -159,7 +159,7 @@ void LCISD_server_init(LCIS_server_t* s)
   if (rc != 0) {
     fprintf(stderr, "Unable to query port\n");
     exit(EXIT_FAILURE);
-  } else if (server->port_attr.link_layer != IBV_LINK_LAYER_ETHERNET &&
+  } else if (server->port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND &&
              !server->port_attr.lid) {
     fprintf(stderr, "Couldn't get local LID\n");
     exit(EXIT_FAILURE);
@@ -171,6 +171,11 @@ void LCISD_server_init(LCIS_server_t* s)
 
   // query the gid
   server->gid_idx = LCI_IBV_GID_IDX;
+  if (server->gid_idx < 0 &&
+      server->port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+    // User did not explicitly specify the gid to use and we are using RoCE
+    server->gid_idx = select_best_gid_for_roce(server);
+  }
   if (server->gid_idx >= 0) {
     LCI_Log(LCI_LOG_INFO, "ibv", "Use GID index: %d\n", server->gid_idx);
     if (ibv_query_gid(server->dev_ctx, server->dev_port, server->gid_idx,
diff --git a/lct/CMakeLists.txt b/lct/CMakeLists.txt
index ed5b3d85..da9a8cd2 100644
--- a/lct/CMakeLists.txt
+++ b/lct/CMakeLists.txt
@@ -30,7 +30,8 @@ target_sources_relative(
   tbarrier/tbarrier.cpp
   util/thread.cpp
   util/time.cpp
-  util/string.cpp)
+  util/string.cpp
+  util/io.cpp)
 
 target_include_directories(LCT PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 
diff --git a/lct/api/lct.h b/lct/api/lct.h
index 43a59f7d..d9d00c27 100644
--- a/lct/api/lct.h
+++ b/lct/api/lct.h
@@ -203,6 +203,10 @@ LCT_API bool LCT_tbarrier_test(LCT_tbarrier_t tbarrier, int64_t ticket);
 LCT_API void LCT_tbarrier_wait(LCT_tbarrier_t tbarrier, int64_t ticket);
 LCT_API void LCT_tbarrier_arrive_and_wait(LCT_tbarrier_t tbarrier);
 
+// File IO
+LCT_API ssize_t LCT_read_file(char* buffer, size_t max,
+                              const char* filename_fmt, ...);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lct/util/io.cpp b/lct/util/io.cpp
new file mode 100644
index 00000000..3e1b2c38
--- /dev/null
+++ b/lct/util/io.cpp
@@ -0,0 +1,51 @@
+// NOTE(review): the four system header names were unreadable in the patch
+// under review; the ones below cover the symbols this file uses -- confirm.
+#include <fcntl.h>
+#include <sys/param.h>
+#include <unistd.h>
+#include <cstdarg>
+#include "lcti.hpp"
+
+// Read at most max - 1 bytes of the file named by filename_fmt/ap into
+// buffer, which is zero-filled first so the result is NUL-terminated.
+// Returns the number of bytes read, or -1 if the file cannot be opened or
+// read, or if max is 0.
+ssize_t read_file_vararg(char* buffer, size_t max, const char* filename_fmt,
+                         va_list ap)
+{
+  char filename[MAXPATHLEN];
+  ssize_t read_bytes;
+  int fd;
+
+  if (max == 0) {
+    // No room for even the terminator; max - 1 below would wrap around.
+    return -1;
+  }
+  memset(buffer, 0, max);
+
+  vsnprintf(filename, MAXPATHLEN, filename_fmt, ap);
+
+  fd = open(filename, O_RDONLY);
+  if (fd < 0) {
+    return -1;
+  }
+
+  read_bytes = read(fd, buffer, max - 1);
+
+  close(fd);
+  return read_bytes;
+}
+
+// Variadic front end of read_file_vararg: the file name is built
+// printf-style from filename_fmt and the arguments that follow it.
+ssize_t LCT_read_file(char* buffer, size_t max, const char* filename_fmt, ...)
+{
+  ssize_t read_bytes;
+  va_list ap;
+
+  va_start(ap, filename_fmt);
+  read_bytes = read_file_vararg(buffer, max, filename_fmt, ap);
+  va_end(ap);
+
+  return read_bytes;
+}
\ No newline at end of file