-
Notifications
You must be signed in to change notification settings - Fork 396
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
prov/udp: detect and use interface MTU to support large messages over UDP #10493
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -655,6 +655,7 @@ struct ofi_addr_list_entry { | |
char ipstr[INET6_ADDRSTRLEN]; | ||
union ofi_sock_ip ipaddr; | ||
size_t speed; | ||
int mtu; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please separate out adding / setting the mtu to ofi_addr_list_entry into its own patch. That will make it clearer which changes are generic, versus tied specifically to udp. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense. I'll do that with the next update. |
||
char net_name[OFI_ADDRSTRLEN]; | ||
char ifa_name[OFI_ADDRSTRLEN]; | ||
uint64_t comm_caps; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,14 +31,15 @@ | |
*/ | ||
|
||
#include "udpx.h" | ||
#include "ofi_osd.h" | ||
|
||
#define UDPX_TX_CAPS (OFI_TX_MSG_CAPS | FI_MULTICAST) | ||
#define UDPX_RX_CAPS (FI_SOURCE | OFI_RX_MSG_CAPS) | ||
#define UDPX_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) | ||
|
||
struct fi_tx_attr udpx_tx_attr = { | ||
.caps = UDPX_TX_CAPS, | ||
.inject_size = 1472, | ||
.inject_size = UDPX_MAX_MSG_SIZE(UDPX_MTU), | ||
.size = 1024, | ||
.iov_limit = UDPX_IOV_LIMIT | ||
}; | ||
|
@@ -53,7 +54,7 @@ struct fi_ep_attr udpx_ep_attr = { | |
.type = FI_EP_DGRAM, | ||
.protocol = FI_PROTO_UDP, | ||
.protocol_version = 0, | ||
.max_msg_size = 1472, | ||
.max_msg_size = UDPX_MAX_MSG_SIZE(UDPX_MTU), | ||
.tx_ctx_cnt = 1, | ||
.rx_ctx_cnt = 1 | ||
}; | ||
|
@@ -93,6 +94,69 @@ struct fi_info udpx_info = { | |
|
||
struct util_prov udpx_util_prov = { | ||
.prov = &udpx_prov, | ||
.info = &udpx_info, | ||
.flags = 0, | ||
.info = NULL, | ||
.flags = 0, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. replaced tab with spaces here - revert change |
||
}; | ||
|
||
|
||
static int match_interface(struct slist_entry *entry, const void *infop) | ||
{ | ||
struct ofi_addr_list_entry *addr_entry; | ||
const struct fi_info* info = infop; | ||
aingerson marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
addr_entry = container_of(entry, struct ofi_addr_list_entry, entry); | ||
return strcmp(addr_entry->net_name, info->fabric_attr->name) == 0 && | ||
strcmp(addr_entry->ifa_name, info->domain_attr->name) == 0; | ||
} | ||
|
||
static void set_mtu_from_addr_list(struct fi_info* info, | ||
struct slist *addr_list) | ||
{ | ||
struct ofi_addr_list_entry *addr_entry; | ||
struct slist_entry *entry; | ||
int max_msg_size; | ||
|
||
entry = slist_find_first_match(addr_list, match_interface, info); | ||
if (entry) { | ||
addr_entry = container_of(entry, | ||
struct ofi_addr_list_entry, | ||
entry); | ||
max_msg_size = UDPX_MAX_MSG_SIZE(addr_entry->mtu); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need to handle the case that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That case is handled in line 125. (The value of UDPX_MAX_MSG_SIZE() is strictly less than its argument value.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The mtu valid check should be direct, not hidden behind a macro which uses an incorrect value that's then checked. The code is assuming too much about an invalid mtu value and requires too much of the reader to figure out what's happening. |
||
if (max_msg_size > 0) { | ||
info->tx_attr->inject_size = max_msg_size; | ||
info->ep_attr->max_msg_size = max_msg_size; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. max_msg_size > 0 should be an assert. We shouldn't leave these sizes unset. |
||
} else { | ||
FI_DBG(&udpx_prov, FI_LOG_CORE, | ||
"Failed to match interface (%s, %s) to " | ||
"address for MTU size\n", | ||
info->fabric_attr->name, info->domain_attr->name); | ||
} | ||
} | ||
|
||
void udpx_util_prov_init(uint32_t version) | ||
{ | ||
|
||
struct slist addr_list; | ||
struct fi_info* cur; | ||
struct fi_info* info; | ||
|
||
if (udpx_util_prov.info == NULL) { | ||
udpx_util_prov.info = &udpx_info; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This line is redundant since it is overwritten by L153. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But |
||
info = fi_allocinfo(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Must check for failure |
||
ofi_ip_getinfo(&udpx_util_prov, version, NULL, NULL, 0, NULL, | ||
&info); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the way to handle this is to add a callback to ofi_ip_getinfo(). That function is already iterating over the interfaces. The only part that's missing is the ability of the provider to update the fi_info based on the specific interface. For most callers, the callback will be null. For UDP, the callback will receive the fi_info and ofi_addr_list_entry, so that it can update the attributes based on the mtu. The callback may need to be passed through to subfunctions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will do that. It seems like a more extensive change to the code, but if you're looking over my shoulder, I'll give it a try. |
||
slist_init(&addr_list); | ||
ofi_get_list_of_addr(&udpx_prov, "iface", &addr_list); | ||
for (cur = info; cur; cur = cur->next) | ||
set_mtu_from_addr_list(cur, &addr_list); | ||
*(struct fi_info**)&udpx_util_prov.info = info; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can be simplified to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
ofi_free_list_of_addr(&addr_list); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't embed the entire function inside an if statement. Revert the if check and return, then outdent the rest of the code. Please add some spaces between lines. |
||
} | ||
|
||
void udpx_util_prov_fini() | ||
{ | ||
if (udpx_util_prov.info != NULL) | ||
fi_freeinfo((struct fi_info*)udpx_util_prov.info); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1942,7 +1942,8 @@ void ofi_free_list_of_addr(struct slist *addr_list) | |
} | ||
|
||
static inline | ||
void ofi_insert_loopback_addr(const struct fi_provider *prov, struct slist *addr_list) | ||
void ofi_insert_loopback_addr(const struct fi_provider *prov, | ||
struct slist *addr_list, int mtu) | ||
{ | ||
struct ofi_addr_list_entry *addr_entry; | ||
|
||
|
@@ -1953,6 +1954,7 @@ void ofi_insert_loopback_addr(const struct fi_provider *prov, struct slist *addr | |
addr_entry->comm_caps = FI_LOCAL_COMM; | ||
addr_entry->ipaddr.sin.sin_family = AF_INET; | ||
addr_entry->ipaddr.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); | ||
addr_entry->mtu = mtu; | ||
ofi_straddr_log(prov, FI_LOG_INFO, FI_LOG_CORE, | ||
"available addr: ", &addr_entry->ipaddr); | ||
|
||
|
@@ -1968,6 +1970,7 @@ void ofi_insert_loopback_addr(const struct fi_provider *prov, struct slist *addr | |
addr_entry->comm_caps = FI_LOCAL_COMM; | ||
addr_entry->ipaddr.sin6.sin6_family = AF_INET6; | ||
addr_entry->ipaddr.sin6.sin6_addr = in6addr_loopback; | ||
addr_entry->mtu = mtu; | ||
ofi_straddr_log(prov, FI_LOG_INFO, FI_LOG_CORE, | ||
"available addr: ", &addr_entry->ipaddr); | ||
|
||
|
@@ -2062,7 +2065,7 @@ void ofi_set_netmask_str(char *netstr, size_t len, struct ifaddrs *ifa) | |
void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, | ||
struct slist *addr_list) | ||
{ | ||
int ret; | ||
int ret, mtu = -1; | ||
char *iface = NULL; | ||
struct ofi_addr_list_entry *addr_entry; | ||
struct ifaddrs *ifaddrs, *ifa; | ||
|
@@ -2089,10 +2092,13 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, | |
if (ifa->ifa_addr == NULL || | ||
!(ifa->ifa_flags & IFF_UP) || | ||
!(ifa->ifa_flags & IFF_RUNNING) || | ||
(ifa->ifa_flags & IFF_LOOPBACK) || | ||
((ifa->ifa_addr->sa_family != AF_INET) && | ||
(ifa->ifa_addr->sa_family != AF_INET6))) | ||
continue; | ||
if (ifa->ifa_flags & IFF_LOOPBACK) { | ||
mtu = ofi_ifaddr_get_mtu(ifa); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. rename to loopback_mtu to make it clear how this is used. On first glance, this value looks like it's ignored. |
||
continue; | ||
} | ||
if (iface && strncmp(iface, ifa->ifa_name, strlen(iface) + 1)) { | ||
FI_DBG(prov, FI_LOG_CORE, | ||
"Skip (%s) interface\n", ifa->ifa_name); | ||
|
@@ -2122,9 +2128,11 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, | |
} | ||
|
||
addr_entry->speed = ofi_ifaddr_get_speed(ifa); | ||
addr_entry->mtu = ofi_ifaddr_get_mtu(ifa); | ||
FI_INFO(prov, FI_LOG_CORE, "Available addr: %s, " | ||
"iface name: %s, speed: %zu\n", | ||
addr_entry->ipstr, ifa->ifa_name, addr_entry->speed); | ||
"iface name: %s, speed: %zu, mtu: %d\n", | ||
addr_entry->ipstr, ifa->ifa_name, addr_entry->speed, | ||
addr_entry->mtu); | ||
|
||
slist_insert_before_first_match(addr_list, ofi_compare_addr_entry, | ||
&addr_entry->entry); | ||
|
@@ -2136,7 +2144,7 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, | |
/* Always add loopback address at the end */ | ||
if (!iface || !strncmp(iface, "lo", strlen(iface) + 1) || | ||
!strncmp(iface, "loopback", strlen(iface) + 1)) | ||
ofi_insert_loopback_addr(prov, addr_list); | ||
ofi_insert_loopback_addr(prov, addr_list, mtu); | ||
} | ||
|
||
#elif defined HAVE_MIB_IPADDRTABLE | ||
|
@@ -2174,6 +2182,7 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, | |
addr_entry->ipaddr.sin.sin_family = AF_INET; | ||
addr_entry->ipaddr.sin.sin_addr.s_addr = | ||
iptbl->table[i].dwAddr; | ||
addr_entry->mtu = -1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. everywhere mtu = -1 should probably use a better default than an unusable value |
||
inet_ntop(AF_INET, &iptbl->table[i].dwAddr, | ||
addr_entry->ipstr, | ||
sizeof(addr_entry->ipstr)); | ||
|
@@ -2182,7 +2191,7 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, | |
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm fairly unfamiliar with Windows, which is where this code is used. Given what I see in |
||
|
||
/* Always add loopback address at the end */ | ||
ofi_insert_loopback_addr(prov, addr_list); | ||
ofi_insert_loopback_addr(prov, addr_list, -1); | ||
|
||
out: | ||
if (iptbl != &_iptbl) | ||
|
@@ -2194,7 +2203,7 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, | |
void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, | ||
struct slist *addr_list) | ||
{ | ||
ofi_insert_loopback_addr(prov, addr_list); | ||
ofi_insert_loopback_addr(prov, addr_list, -1); | ||
} | ||
#endif | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -477,6 +477,7 @@ int getifaddrs(struct ifaddrs **ifap) | |
(*addr6) = *(struct sockaddr_in6 *) pSockAddr; | ||
} | ||
fa->speed = aa->TransmitLinkSpeed; | ||
fa->mtu = (int)aa->Mtu; | ||
/* Generate fake Unix-like device names */ | ||
sprintf_s(fa->ad_name, sizeof(fa->ad_name), "eth%d", i++); | ||
} | ||
|
@@ -497,6 +498,11 @@ size_t ofi_ifaddr_get_speed(struct ifaddrs *ifa) | |
return ifa->speed; | ||
} | ||
|
||
int ofi_ifaddr_get_mtu(const struct ifaddrs *ifa) | ||
{ | ||
return ifa->mtu; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indentation is off. |
||
} | ||
|
||
void freeifaddrs(struct ifaddrs *ifa) | ||
{ | ||
while (ifa) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not just return 1500 as the default? Setting mtu = -1 as a default looks like a recipe for problems and missed error checking
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for your feedback. One reason for using the value -1 is that it removes the need to put default MTU size values in a bunch more files. If there is there a good, single place to put such a definition, I must have missed it. I'm open to suggestions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I actually disagree with Sean for once. Returning -1 is similar to an enosys indicating that finding the MTU size is not supported/possible. Returning 1500 seems like it's falsely stating that it was able to determine that to be the MTU size. I think it's more appropriate for the default to be in the caller where it defaults to 1500 if were unable to find it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Then you need to add checks everywhere that the mtu size is valid prior to using it. There are several places where mtu == -1, but that value is used anyway, with, maybe, checks coming later that the calculation is valid. If the mtu can't be determined, then remove that interface from the list. I can't tell that these changes work at all on windows.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Aaaand back to agreeing with Sean. Yes, proper error checking is absolutely necessary for that to work.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wouldn't removing an interface whose mtu can't be determined potentially break existing client code? Doesn't the code in the main branch implicitly assume mtu=1500 for any interface supporting udp to determine the endpoint
max_msg_size
andinject_size
? Falling back to that behavior seems like it might be safer (from that perspective) than removing the interface from the list. From a correctness point of view, dropping the interface seems preferable. For my use case, either choice is fine, so I could use some advice about how to proceed.