From ed17f9a1733dd700f6ac619d1f7a2de9a6e8d379 Mon Sep 17 00:00:00 2001 From: Martin Pokorny Date: Fri, 25 Oct 2024 15:44:28 -0600 Subject: [PATCH] prov/udp: detect and use MTU to set max_msg_size and inject_size For each interface detected by the udp provider, determine the MTU of the interface, and use that value to set the max_msg_size field of the fi_ep_attr and fi_tx_attr values of the fi_info element. When the MTU cannot be determined, the MTU value assumed by previous code versions (1500) is used. Signed-off-by: Martin Pokorny --- include/freebsd/osd.h | 7 +++-- include/linux/osd.h | 2 ++ include/ofi_net.h | 1 + include/osx/osd.h | 5 +++ include/windows/ifaddrs.h | 2 +- include/windows/osd.h | 2 ++ man/fi_udp.7.md | 10 +++--- prov/udp/src/udpx.h | 7 +++-- prov/udp/src/udpx_attr.c | 66 ++++++++++++++++++++++++++++++++++++--- prov/udp/src/udpx_init.c | 13 ++++++-- src/common.c | 24 +++++++++----- src/linux/osd.c | 28 +++++++++++++++++ src/windows/osd.c | 6 ++++ 13 files changed, 148 insertions(+), 25 deletions(-) diff --git a/include/freebsd/osd.h b/include/freebsd/osd.h index 81414d1d59c..a137f7ef5e8 100644 --- a/include/freebsd/osd.h +++ b/include/freebsd/osd.h @@ -76,6 +76,11 @@ static inline size_t ofi_ifaddr_get_speed(struct ifaddrs *ifa) return 0; } +static inline int ofi_ifaddr_get_mtu(const struct ifaddrs *ifa) +{ + return -1; +} + static inline ssize_t ofi_process_vm_readv(pid_t pid, const struct iovec *local_iov, unsigned long liovcnt, @@ -185,5 +190,3 @@ ofi_recvv_socket(SOCKET fd, const struct iovec *iov, size_t cnt, int flags) } #endif /* _FREEBSD_OSD_H_ */ - - diff --git a/include/linux/osd.h b/include/linux/osd.h index 5b8d0fcd4ee..2d3108de594 100644 --- a/include/linux/osd.h +++ b/include/linux/osd.h @@ -92,6 +92,8 @@ static inline int ofi_hugepage_enabled(void) size_t ofi_ifaddr_get_speed(struct ifaddrs *ifa); +int ofi_ifaddr_get_mtu(const struct ifaddrs *ifa); + #ifndef __NR_process_vm_readv # define __NR_process_vm_readv 310 #endif diff --git 
a/include/ofi_net.h b/include/ofi_net.h index c9f4df00774..1abe9435784 100644 --- a/include/ofi_net.h +++ b/include/ofi_net.h @@ -655,6 +655,7 @@ struct ofi_addr_list_entry { char ipstr[INET6_ADDRSTRLEN]; union ofi_sock_ip ipaddr; size_t speed; + int mtu; char net_name[OFI_ADDRSTRLEN]; char ifa_name[OFI_ADDRSTRLEN]; uint64_t comm_caps; diff --git a/include/osx/osd.h b/include/osx/osd.h index 2f7494af6e1..8deae7e1e41 100644 --- a/include/osx/osd.h +++ b/include/osx/osd.h @@ -99,6 +99,11 @@ static inline size_t ofi_ifaddr_get_speed(struct ifaddrs *ifa) return 0; } +static inline int ofi_ifaddr_get_mtu(const struct ifaddrs *ifa) +{ + return -1; +} + static inline int ofi_hugepage_enabled(void) { return 0; diff --git a/include/windows/ifaddrs.h b/include/windows/ifaddrs.h index 02e657cb480..6aad728995d 100644 --- a/include/windows/ifaddrs.h +++ b/include/windows/ifaddrs.h @@ -34,8 +34,8 @@ struct ifaddrs { char ad_name[16]; size_t speed; + int mtu; }; int getifaddrs(struct ifaddrs **ifap); void freeifaddrs(struct ifaddrs *ifa); - diff --git a/include/windows/osd.h b/include/windows/osd.h index d9698bd9724..102cc9e428a 100644 --- a/include/windows/osd.h +++ b/include/windows/osd.h @@ -1006,6 +1006,8 @@ static inline int ofi_is_loopback_addr(struct sockaddr *addr) { size_t ofi_ifaddr_get_speed(struct ifaddrs *ifa); +int ofi_ifaddr_get_mtu(const struct ifaddrs *ifa); + #define file2unix_time 10000000i64 #define win2unix_epoch 116444736000000000i64 #define CLOCK_REALTIME 0 diff --git a/man/fi_udp.7.md b/man/fi_udp.7.md index cbe0a371012..27c518be42f 100644 --- a/man/fi_udp.7.md +++ b/man/fi_udp.7.md @@ -41,9 +41,8 @@ receiving datagram messages over an unreliable endpoint. # LIMITATIONS -The UDP provider has hard-coded maximums for supported queue sizes and data -transfers. These values are reflected in the related fabric attribute -structures +The UDP provider has a hard-coded maximum for supported queue sizes. 
+This value is reflected in the related fabric attribute structures. EPs must be bound to both RX and TX CQs. @@ -53,7 +52,10 @@ No support for counters. # RUNTIME PARAMETERS -No runtime parameters are currently defined. +The UDP provider checks for the following environment variables - + +*FI_UDP_IFACE* +: A string value that specifies the name of the interface. # SEE ALSO diff --git a/prov/udp/src/udpx.h b/prov/udp/src/udpx.h index a52ff392b99..8833a582e43 100644 --- a/prov/udp/src/udpx.h +++ b/prov/udp/src/udpx.h @@ -63,22 +63,21 @@ #ifndef _UDPX_H_ #define _UDPX_H_ - extern struct fi_provider udpx_prov; extern struct util_prov udpx_util_prov; extern struct fi_info udpx_info; - int udpx_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *context); int udpx_domain_open(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **dom, void *context); int udpx_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, struct fid_eq **eq, void *context); - +void udpx_util_prov_init(uint32_t version); #define UDPX_FLAG_MULTI_RECV 1 #define UDPX_IOV_LIMIT 4 +#define UDPX_MTU 1500 struct udpx_ep_entry { void *context; @@ -88,6 +87,8 @@ struct udpx_ep_entry { uint8_t resv[sizeof(size_t) - 2]; }; +#define UDPX_MAX_MSG_SIZE(mtu) ((mtu) - 28) + OFI_DECLARE_CIRQUE(struct udpx_ep_entry, udpx_rx_cirq); struct udpx_ep; diff --git a/prov/udp/src/udpx_attr.c b/prov/udp/src/udpx_attr.c index 7737efe86fc..7eaf5d1e8e9 100644 --- a/prov/udp/src/udpx_attr.c +++ b/prov/udp/src/udpx_attr.c @@ -31,6 +31,7 @@ */ #include "udpx.h" +#include "ofi_osd.h" #define UDPX_TX_CAPS (OFI_TX_MSG_CAPS | FI_MULTICAST) #define UDPX_RX_CAPS (FI_SOURCE | OFI_RX_MSG_CAPS) @@ -38,7 +39,7 @@ struct fi_tx_attr udpx_tx_attr = { .caps = UDPX_TX_CAPS, - .inject_size = 1472, + .inject_size = UDPX_MAX_MSG_SIZE(UDPX_MTU), .size = 1024, .iov_limit = UDPX_IOV_LIMIT }; @@ -53,7 +54,7 @@ struct fi_ep_attr udpx_ep_attr = { .type = FI_EP_DGRAM, .protocol = FI_PROTO_UDP, .protocol_version = 0, 
-	.max_msg_size = 1472,
+	.max_msg_size = UDPX_MAX_MSG_SIZE(UDPX_MTU),
 	.tx_ctx_cnt = 1,
 	.rx_ctx_cnt = 1
 };
@@ -93,6 +94,87 @@ struct fi_info udpx_info = {
 
 struct util_prov udpx_util_prov = {
 	.prov = &udpx_prov,
-	.info = &udpx_info,
+	.info = NULL,
 	.flags = 0,
 };
+
+/*
+ * slist match callback: non-zero when the address-list entry's network and
+ * interface names equal the fabric and domain names of the fi_info passed
+ * in infop.
+ */
+static int match_interface(struct slist_entry *entry, const void *infop)
+{
+	const struct fi_info *info = infop;
+	struct ofi_addr_list_entry *addr_entry;
+
+	addr_entry = container_of(entry, struct ofi_addr_list_entry, entry);
+	return strcmp(addr_entry->net_name, info->fabric_attr->name) == 0 &&
+	       strcmp(addr_entry->ifa_name, info->domain_attr->name) == 0;
+}
+
+/*
+ * Size info's message limits from the MTU recorded for its interface in
+ * addr_list.  The compiled-in defaults are left untouched when no entry
+ * matches or when the entry's MTU does not yield a positive payload size.
+ */
+static void set_mtu_from_addr_list(struct fi_info *info,
+				   struct slist *addr_list)
+{
+	struct ofi_addr_list_entry *addr_entry;
+	struct slist_entry *entry;
+	int max_msg_size;
+
+	entry = slist_find_first_match(addr_list, match_interface, info);
+	if (!entry) {
+		FI_DBG(&udpx_prov, FI_LOG_CORE,
+		       "Failed to match interface (%s, %s) to "
+		       "address for MTU size\n",
+		       info->fabric_attr->name, info->domain_attr->name);
+		return;
+	}
+
+	addr_entry = container_of(entry, struct ofi_addr_list_entry, entry);
+	max_msg_size = UDPX_MAX_MSG_SIZE(addr_entry->mtu);
+	if (max_msg_size > 0) {
+		info->tx_attr->inject_size = max_msg_size;
+		info->ep_attr->max_msg_size = max_msg_size;
+	}
+}
+
+/*
+ * Populate udpx_util_prov.info on first use: obtain the info list from
+ * ofi_ip_getinfo(), then size each entry from the MTU of its matching
+ * interface.  Does nothing once the list is set.  No locking is done here;
+ * concurrent callers must serialize the call themselves.
+ */
+void udpx_util_prov_init(uint32_t version)
+{
+	struct slist addr_list;
+	struct fi_info *info = NULL;
+	struct fi_info *cur;
+	int ret;
+
+	if (udpx_util_prov.info)
+		return;
+
+	/* ofi_ip_getinfo() needs the static template to generate from */
+	udpx_util_prov.info = &udpx_info;
+	ret = ofi_ip_getinfo(&udpx_util_prov, version, NULL, NULL, 0, NULL,
+			     &info);
+	if (ret || !info) {
+		FI_WARN(&udpx_prov, FI_LOG_CORE,
+			"Failed to build per-interface info list (%d)\n", ret);
+		/* Retry on the next call; never publish the static template */
+		udpx_util_prov.info = NULL;
+		return;
+	}
+
+	slist_init(&addr_list);
+	ofi_get_list_of_addr(&udpx_prov, "iface", &addr_list);
+	for (cur = info; cur; cur = cur->next)
+		set_mtu_from_addr_list(cur, &addr_list);
+	ofi_free_list_of_addr(&addr_list);
+
+	udpx_util_prov.info = info;
+}
diff --git a/prov/udp/src/udpx_init.c b/prov/udp/src/udpx_init.c index 
85ad70f749b..69b48e8213a 100644
--- a/prov/udp/src/udpx_init.c
+++ b/prov/udp/src/udpx_init.c
@@ -37,20 +37,34 @@
 
 #include
 
+/* Serializes udpx_util_prov_init() calls made from udpx_getinfo() */
+static ofi_mutex_t init_lock;
 
+/*
+ * Make sure the provider's info list exists (udpx_util_prov_init() is run
+ * under init_lock), then have util_getinfo() match the request against it.
+ */
 static int udpx_getinfo(uint32_t version, const char *node, const char *service,
 			uint64_t flags, const struct fi_info *hints,
 			struct fi_info **info)
 {
-	return ofi_ip_getinfo(&udpx_util_prov, version, node, service, flags,
-			      hints, info);
+	ofi_mutex_lock(&init_lock);
+	udpx_util_prov_init(version);
+	ofi_mutex_unlock(&init_lock);
+	return util_getinfo(&udpx_util_prov, version, node, service, flags,
+			    hints, info);
 }
 
 static void udpx_fini(void)
 {
-	/* yawn */
+	/* Drop the info list and clear the pointer so it cannot dangle */
+	if (udpx_util_prov.info != NULL) {
+		fi_freeinfo(udpx_util_prov.info);
+		udpx_util_prov.info = NULL;
+	}
+	ofi_mutex_destroy(&init_lock);
 }
 
 struct fi_provider udpx_prov = {
 	.name = "udp",
 	.version = OFI_VERSION_DEF_PROV,
@@ -65,5 +79,6 @@ UDP_INI
 	fi_param_define(&udpx_prov, "iface", FI_PARAM_STRING,
 			"Specify interface name");
 
+	ofi_mutex_init(&init_lock);
 	return &udpx_prov;
 }
diff --git a/src/common.c b/src/common.c
index a5f5ba5a22e..1280f216820 100644
--- a/src/common.c
+++ b/src/common.c
@@ -1942,7 +1942,8 @@ void ofi_free_list_of_addr(struct slist *addr_list)
 }
 
 static inline
-void ofi_insert_loopback_addr(const struct fi_provider *prov, struct slist *addr_list)
+void ofi_insert_loopback_addr(const struct fi_provider *prov,
+			      struct slist *addr_list, int mtu)
 {
 	struct ofi_addr_list_entry *addr_entry;
 
@@ -1953,6 +1954,7 @@ void ofi_insert_loopback_addr(const struct fi_provider *prov, struct slist *addr
 	addr_entry->comm_caps = FI_LOCAL_COMM;
 	addr_entry->ipaddr.sin.sin_family = AF_INET;
 	addr_entry->ipaddr.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+	addr_entry->mtu = mtu;
 	ofi_straddr_log(prov, FI_LOG_INFO, FI_LOG_CORE,
 			"available addr: ", &addr_entry->ipaddr);
 
@@ -1968,6 +1970,7 @@ void ofi_insert_loopback_addr(const struct fi_provider *prov, struct slist *addr
 	addr_entry->comm_caps = FI_LOCAL_COMM;
 	addr_entry->ipaddr.sin6.sin6_family = AF_INET6;
 	addr_entry->ipaddr.sin6.sin6_addr = in6addr_loopback;
+	addr_entry->mtu = 
mtu; ofi_straddr_log(prov, FI_LOG_INFO, FI_LOG_CORE, "available addr: ", &addr_entry->ipaddr); @@ -2062,7 +2065,7 @@ void ofi_set_netmask_str(char *netstr, size_t len, struct ifaddrs *ifa) void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, struct slist *addr_list) { - int ret; + int ret, mtu = -1; char *iface = NULL; struct ofi_addr_list_entry *addr_entry; struct ifaddrs *ifaddrs, *ifa; @@ -2089,10 +2092,13 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, if (ifa->ifa_addr == NULL || !(ifa->ifa_flags & IFF_UP) || !(ifa->ifa_flags & IFF_RUNNING) || - (ifa->ifa_flags & IFF_LOOPBACK) || ((ifa->ifa_addr->sa_family != AF_INET) && (ifa->ifa_addr->sa_family != AF_INET6))) continue; + if (ifa->ifa_flags & IFF_LOOPBACK) { + mtu = ofi_ifaddr_get_mtu(ifa); + continue; + } if (iface && strncmp(iface, ifa->ifa_name, strlen(iface) + 1)) { FI_DBG(prov, FI_LOG_CORE, "Skip (%s) interface\n", ifa->ifa_name); @@ -2122,9 +2128,11 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, } addr_entry->speed = ofi_ifaddr_get_speed(ifa); + addr_entry->mtu = ofi_ifaddr_get_mtu(ifa); FI_INFO(prov, FI_LOG_CORE, "Available addr: %s, " - "iface name: %s, speed: %zu\n", - addr_entry->ipstr, ifa->ifa_name, addr_entry->speed); + "iface name: %s, speed: %zu, mtu: %d\n", + addr_entry->ipstr, ifa->ifa_name, addr_entry->speed, + addr_entry->mtu); slist_insert_before_first_match(addr_list, ofi_compare_addr_entry, &addr_entry->entry); @@ -2136,7 +2144,7 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, /* Always add loopback address at the end */ if (!iface || !strncmp(iface, "lo", strlen(iface) + 1) || !strncmp(iface, "loopback", strlen(iface) + 1)) - ofi_insert_loopback_addr(prov, addr_list); + ofi_insert_loopback_addr(prov, addr_list, mtu); } #elif defined HAVE_MIB_IPADDRTABLE @@ -2182,7 +2190,7 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, } 
/* Always add loopback address at the end */ - ofi_insert_loopback_addr(prov, addr_list); + ofi_insert_loopback_addr(prov, addr_list, -1); out: if (iptbl != &_iptbl) @@ -2194,7 +2202,7 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, struct slist *addr_list) { - ofi_insert_loopback_addr(prov, addr_list); + ofi_insert_loopback_addr(prov, addr_list, -1); } #endif diff --git a/src/linux/osd.c b/src/linux/osd.c index cb848367506..9f8761b4e5b 100644 --- a/src/linux/osd.c +++ b/src/linux/osd.c @@ -257,3 +257,31 @@ size_t ofi_ifaddr_get_speed(struct ifaddrs *ifa) } #endif /* HAVE_ETHTOOL */ + +int ofi_ifaddr_get_mtu(const struct ifaddrs *ifa) +{ + FILE *fp; + char *mtu_filename; + int mtu; + + if (asprintf(&mtu_filename, "/sys/class/net/%s/mtu", + ifa->ifa_name) == -1) + return 0; + + fp = fopen(mtu_filename, "r"); + if (!fp) + goto err1; + + if (fscanf(fp, "%d", &mtu) != 1) + goto err2; + + fclose(fp); + free(mtu_filename); + + return mtu; + err2: + fclose(fp); + err1: + free(mtu_filename); + return 0; +} diff --git a/src/windows/osd.c b/src/windows/osd.c index 6f693514c7f..82282cc3bd4 100644 --- a/src/windows/osd.c +++ b/src/windows/osd.c @@ -477,6 +477,7 @@ int getifaddrs(struct ifaddrs **ifap) (*addr6) = *(struct sockaddr_in6 *) pSockAddr; } fa->speed = aa->TransmitLinkSpeed; + fa->mtu = (int)aa->Mtu; /* Generate fake Unix-like device names */ sprintf_s(fa->ad_name, sizeof(fa->ad_name), "eth%d", i++); } @@ -497,6 +498,11 @@ size_t ofi_ifaddr_get_speed(struct ifaddrs *ifa) return ifa->speed; } +int ofi_ifaddr_get_mtu(const struct ifaddrs *ifa) +{ + return ifa->mtu; +} + void freeifaddrs(struct ifaddrs *ifa) { while (ifa) {