From 17a2b90880abb4072620ac415a10458977e68ee6 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 20 Dec 2024 04:30:47 +0000 Subject: [PATCH 1/5] Add tests/unit/show_tuner_decisions to gitignore. Signed-off-by: Brian Barrett --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d295f4b50..03424e0f0 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ tests/unit/show_tuner_costs tests/unit/ep_addr_list tests/unit/mr tests/unit/region_based_tuner +tests/unit/show_tuner_decisions # http://www.gnu.org/software/automake .deps/ From 80e4ab0a33a1d410a1bf02dc4a7313259fc5f6ce Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 20 Dec 2024 04:34:31 +0000 Subject: [PATCH 2/5] Refactor platform-aws instance config lookup Refactor the get_platform_data() call to expose the critical pieces to allow for unit testing. Signed-off-by: Brian Barrett --- include/Makefile.am | 3 +- include/platform-aws.h | 58 +++++++++++++++++++++++++ src/platform-aws.c | 97 ++++++++++++++++++++++++++---------------- 3 files changed, 121 insertions(+), 37 deletions(-) create mode 100644 include/platform-aws.h diff --git a/include/Makefile.am b/include/Makefile.am index c3150d615..2303e4437 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -38,4 +38,5 @@ noinst_HEADERS = \ tracing_impl/nvtx.h \ internal/tuner/nccl_defaults.h \ nccl_ofi_platform.h \ - nccl_ofi_ep_addr_list.h + nccl_ofi_ep_addr_list.h \ + platform-aws.h diff --git a/include/platform-aws.h b/include/platform-aws.h new file mode 100644 index 000000000..fedbfe7f5 --- /dev/null +++ b/include/platform-aws.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * Access helper functions from platform-aws specifically for unit + * tests. You do not want to include this file outside of + * platform-aws.c or a unit test, or you'll break linking on non-AWS + * platforms. + */ + +#ifndef PLATFORM_AWS_H_ +#define PLATFORM_AWS_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +struct ec2_platform_data { + const char* name; + const char* topology; + int default_dup_conns; + float latency; + bool gdr_required; + bool net_flush_required; + const char *default_protocol; + int domain_per_thread; +}; + + +/* + * @brief Get the platform data map + * + * This function exists solely to test + * platform_aws_get_platform_entry() against the production data map. + */ +struct ec2_platform_data *platform_aws_get_platform_map(size_t *len); + + +/* + * @brief Returns platform data for current platform type, if found + * + * @input Platform type + * + * @return NULL, if no topology found + * platform data, if match found + */ +struct ec2_platform_data *platform_aws_get_platform_entry(const char *platform_type, + struct ec2_platform_data *platform_data_list, + size_t platform_data_len); + + +#ifdef __cplusplus +} // End extern "C" +#endif + +#endif // End NCCL_OFI_H_ diff --git a/src/platform-aws.c b/src/platform-aws.c index 4d093de78..e351d032a 100644 --- a/src/platform-aws.c +++ b/src/platform-aws.c @@ -20,6 +20,7 @@ #include "nccl_ofi.h" #include "nccl_ofi_platform.h" +#include "platform-aws.h" #include "nccl_ofi_log.h" #include "nccl_ofi_math.h" #include "nccl_ofi_rdma.h" @@ -27,16 +28,6 @@ #include "nccl_ofi_pthread.h" #include "nccl_ofi_system.h" -struct ec2_platform_data { - const char* name; - const char* topology; - int default_dup_conns; - float latency; - bool gdr_required; - bool net_flush_required; - const char *default_protocol; - int domain_per_thread; -}; /* * platform_data_map is an ordered list of platform entries. The @@ -118,12 +109,62 @@ static struct ec2_platform_data platform_data_map[] = { }; +struct ec2_platform_data *platform_aws_get_platform_map(size_t *len) +{ + *len = sizeof(platform_data_map)/sizeof(platform_data_map[0]); + return platform_data_map; +} + + +/* + * internal function (exported for unit test purposes) for finding the + * correct platform data entry. You should use + * platform_Aws_get_platform_data() so that you get caching and all + * that niceness. + */ +struct ec2_platform_data *platform_aws_get_platform_entry(const char *platform_type, + struct ec2_platform_data *platform_data_list, + size_t platform_data_len) +{ + struct ec2_platform_data *response = NULL; + regex_t regex; + int ret; + + for (size_t idx = 0; idx < platform_data_len; idx++) { + ret = regcomp(®ex, platform_data_list[idx].name, 0); + if (ret != 0) { + NCCL_OFI_WARN("Could not compile platform_type regex for %s", + platform_data_list[idx].name); + goto done; + } + + ret = regexec(®ex, platform_type, 0, NULL, 0); + + regfree(®ex); + + if (ret == 0) { + response = &platform_data_list[idx]; + break; + } else if (ret != REG_NOMATCH) { + NCCL_OFI_WARN("Regex match failed"); + goto done; + } + } + +done: + NCCL_OFI_TRACE(NCCL_NET | NCCL_INIT, "Using platform block %s for instance type %s", + (response == NULL) ? "none" : response->name, platform_type); + + return response; +} + + /* * @brief Returns platform data for current platform type, if found * - * @input Platform type + * @input none * - * @return NULL, if no topology found + * @return NULL, if no entry found * platform data, if match found */ static struct ec2_platform_data *get_platform_data(void) @@ -131,10 +172,9 @@ static struct ec2_platform_data *get_platform_data(void) static bool init = false; static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; static struct ec2_platform_data *platform_data = NULL; - const size_t platform_n = sizeof(platform_data_map)/sizeof(platform_data_map[0]); const char* platform_type = NULL; - regex_t regex; - int ret; + struct ec2_platform_data *platform_data_list; + size_t platform_data_len; nccl_net_ofi_mutex_lock(&mutex); @@ -148,29 +188,14 @@ static struct ec2_platform_data *get_platform_data(void) goto done; } - for (size_t idx = 0; idx < platform_n; idx++) { - ret = regcomp(®ex, platform_data_map[idx].name, 0); - if (ret != 0) { - NCCL_OFI_WARN("Could not compile platform_type regex for %s", - platform_data_map[idx].name); - goto done; - } - - ret = regexec(®ex, platform_type, 0, NULL, 0); - - regfree(®ex); - - if (ret == 0) { - platform_data = &platform_data_map[idx]; - break; - } else if (ret != REG_NOMATCH) { - NCCL_OFI_WARN("Regex match failed"); - goto done; - } + platform_data_list = platform_aws_get_platform_map(&platform_data_len); + if (platform_data_list == NULL) { + goto done; } - NCCL_OFI_TRACE(NCCL_NET | NCCL_INIT, "Using platform block %s for instance type %s", - (platform_data == NULL) ? "none" : platform_data->name, platform_type); + platform_data = platform_aws_get_platform_entry(platform_type, platform_data_list, + platform_data_len); + done: nccl_net_ofi_mutex_unlock(&mutex); From eb41ab39cca323b9409896ce8917a3124c2998dd Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 20 Dec 2024 04:35:49 +0000 Subject: [PATCH 3/5] tests: Add platform-aws config lookup test Writing regexes is hard. Making sure your regexes work without a test is harder. So we should have a unit test for the code to do instance type -> config block lookup. Signed-off-by: Brian Barrett --- .gitignore | 1 + tests/unit/Makefile.am | 5 ++ tests/unit/aws_platform_mapper.c | 102 +++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 tests/unit/aws_platform_mapper.c diff --git a/.gitignore b/.gitignore index 03424e0f0..e073de840 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ tests/unit/ep_addr_list tests/unit/mr tests/unit/region_based_tuner tests/unit/show_tuner_decisions +tests/unit/aws_platform_mapper # http://www.gnu.org/software/automake .deps/ diff --git a/tests/unit/Makefile.am b/tests/unit/Makefile.am index ab4b48669..05b91ac99 100644 --- a/tests/unit/Makefile.am +++ b/tests/unit/Makefile.am @@ -20,6 +20,10 @@ noinst_PROGRAMS = \ ep_addr_list \ mr +if WANT_PLATFORM_AWS +noinst_PROGRAMS += aws_platform_mapper +endif + if !ENABLE_NEURON if WANT_PLATFORM_AWS AM_LDFLAGS = $(CUDA_LDFLAGS) @@ -38,6 +42,7 @@ msgbuff_SOURCES = msgbuff.c scheduler_SOURCES = scheduler.c ep_addr_list_SOURCES = ep_addr_list.c mr_SOURCES = mr.c +aws_platform_mapper_SOURCES = aws_platform_mapper.c TESTS = $(noinst_PROGRAMS) endif diff --git a/tests/unit/aws_platform_mapper.c b/tests/unit/aws_platform_mapper.c new file mode 100644 index 000000000..1b6a14cdc --- /dev/null +++ b/tests/unit/aws_platform_mapper.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include "config.h" + +#include +#include + +#include "platform-aws.h" + + +/* check that we get the expected response for all our known platforms */ +static int check_value(struct ec2_platform_data *platform_data_list, size_t len, + const char *platform_type, const char *expected_value) +{ + struct ec2_platform_data *entry = platform_aws_get_platform_entry(platform_type, + platform_data_list, + len); + + if (NULL == entry && expected_value != NULL) { + printf("Got NULL reply, expected %s\n", expected_value); + return 1; + } else if (NULL != entry && expected_value == NULL) { + printf("Got reply %s, expected NULL\n", entry->name); + return 1; + } else if (NULL == entry && expected_value == NULL) { + return 0; + } else if (0 != strcmp(entry->name, expected_value)) { + printf("Got reply %s, expected %s\n", entry->name, expected_value); + return 1; + } + + return 0; +} + + +static int check_known_platforms(void) +{ + struct ec2_platform_data *platform_data_list; + size_t len; + int ret = 0; + + platform_data_list = platform_aws_get_platform_map(&len); + + ret += check_value(platform_data_list, len, "trn1.32xlarge", "^trn1.*"); + ret += check_value(platform_data_list, len, "trn1n.32xlarge", "^trn1.*"); + ret += check_value(platform_data_list, len, "trn1.2xlarge", "^trn1.*"); + ret += check_value(platform_data_list, len, "trn2.48xlarge", "^trn2.*"); + ret += check_value(platform_data_list, len, "trn2u.48xlarge", "^trn2.*"); + ret += check_value(platform_data_list, len, "inf2.48xlarge", NULL); + ret += check_value(platform_data_list, len, "p3.2xlarge", NULL); + ret += check_value(platform_data_list, len, "p3.8xlarge", NULL); + ret += check_value(platform_data_list, len, "p3.16xlarge", NULL); + ret += check_value(platform_data_list, len, "p3dn.24xlarge", "^p3dn.24xlarge$"); + ret += check_value(platform_data_list, len, "p4d.24xlarge", "^p4d.24xlarge$"); + ret += check_value(platform_data_list, len, "p4de.24xlarge", "^p4de.24xlarge$"); + ret += check_value(platform_data_list, len, "p5.48xlarge", "^p5.*"); + ret += check_value(platform_data_list, len, "p5e.48xlarge", "^p5.*"); + ret += check_value(platform_data_list, len, "p5en.48xlarge", "^p5.*"); + ret += check_value(platform_data_list, len, "g5.48xlarge", "^g5.48xlarge$"); + ret += check_value(platform_data_list, len, "g6.16xlarge", NULL); + + return ret; +} + + +static struct ec2_platform_data test_map_1[] = { + { + .name = "^platform-x$", + .topology = NULL, + .default_dup_conns = 0, + .latency = 0.0, + .gdr_required = false, + .net_flush_required = false, + .default_protocol = NULL, + .domain_per_thread = 0, + }, + { + .name = "^platform.*", + .topology = NULL, + .default_dup_conns = 0, + .latency = 0.0, + .gdr_required = false, + .net_flush_required = false, + .default_protocol = NULL, + .domain_per_thread = 0, + }, +}; + +int main(int argc, char *argv[]) { + int ret = 0; + + /* verify we get the answer we want on real platforms */ + ret += check_known_platforms(); + + /* make sure we maintain ordering */ + ret += check_value(test_map_1, 2, "platform-x", "^platform-x$"); + ret += check_value(test_map_1, 2, "platform-xy", "^platform.*"); + + return ret; +} From 7f6e0e50cd9ade94358ada8a68d74925e3e7f85b Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 20 Dec 2024 05:11:47 +0000 Subject: [PATCH 4/5] aws: Improve platform regex matching Add an explicit (optional) field for the platform regex, and make the name an id rather than a regex. This means that we can change the regex without having to update all the unit tests to match. Signed-off-by: Brian Barrett --- include/platform-aws.h | 1 + src/platform-aws.c | 57 ++++++++++++++++++++------------ tests/unit/aws_platform_mapper.c | 34 ++++++++++--------- 3 files changed, 55 insertions(+), 37 deletions(-) diff --git a/include/platform-aws.h b/include/platform-aws.h index fedbfe7f5..5b7257658 100644 --- a/include/platform-aws.h +++ b/include/platform-aws.h @@ -19,6 +19,7 @@ extern "C" { struct ec2_platform_data { const char* name; + const char* regex; const char* topology; int default_dup_conns; float latency; diff --git a/src/platform-aws.c b/src/platform-aws.c index e351d032a..3cf3f58f2 100644 --- a/src/platform-aws.c +++ b/src/platform-aws.c @@ -37,7 +37,8 @@ */ static struct ec2_platform_data platform_data_map[] = { { - .name = "^p4d.24xlarge$", + .name = "p4d.24xlarge", + .regex = NULL, .topology = "p4d-24xl-topo.xml", .default_dup_conns = 0, .latency = 75.0, @@ -47,7 +48,8 @@ static struct ec2_platform_data platform_data_map[] = { .domain_per_thread = 0, }, { - .name = "^p4de.24xlarge$", + .name = "p4de.24xlarge", + .regex = NULL, .topology = "p4de-24xl-topo.xml", .default_dup_conns = 0, .latency = 75.0, @@ -57,7 +59,8 @@ static struct ec2_platform_data platform_data_map[] = { .domain_per_thread = 0, }, { - .name = "^p3dn.24xlarge$", + .name = "p3dn.24xlarge", + .regex = NULL, .topology = NULL, .default_dup_conns = 4, .latency = 150.0, @@ -67,7 +70,8 @@ static struct ec2_platform_data platform_data_map[] = { .domain_per_thread = 0, }, { - .name = "^p5.*", + .name = "p-series", + .regex = "^p5.*", .topology = NULL, .default_dup_conns = 0, .latency = 75.0, @@ -77,7 +81,8 @@ static struct ec2_platform_data platform_data_map[] = { .domain_per_thread = 0, }, { - .name = "^g5.48xlarge$", + .name = "g5.48xlarge", + .regex = NULL, .topology = "g5.48xl-topo.xml", .default_dup_conns = 0, .latency = 75.0, @@ -87,7 +92,8 @@ static struct ec2_platform_data platform_data_map[] = { .domain_per_thread = 0, }, { - .name = "^trn1.*", + .name = "trn1", + .regex = "^trn1.*", .topology = NULL, .default_dup_conns = 0, .latency = 75.0, @@ -97,7 +103,8 @@ static struct ec2_platform_data platform_data_map[] = { .domain_per_thread = 1, }, { - .name = "^trn2.*", + .name = "trn2", + .regex = "^trn2.*", .topology = NULL, .default_dup_conns = 0, .latency = 75.0, @@ -131,23 +138,31 @@ struct ec2_platform_data *platform_aws_get_platform_entry(const char *platform_t int ret; for (size_t idx = 0; idx < platform_data_len; idx++) { - ret = regcomp(®ex, platform_data_list[idx].name, 0); - if (ret != 0) { - NCCL_OFI_WARN("Could not compile platform_type regex for %s", - platform_data_list[idx].name); - goto done; - } + if (platform_data_list[idx].regex == NULL) { + if (0 == strcmp(platform_type, + platform_data_list[idx].name)) { + response = &platform_data_list[idx]; + break; + } + } else { + ret = regcomp(®ex, platform_data_list[idx].regex, 0); + if (ret != 0) { + NCCL_OFI_WARN("Could not compile platform_type regex for %s", + platform_data_list[idx].regex); + goto done; + } - ret = regexec(®ex, platform_type, 0, NULL, 0); + ret = regexec(®ex, platform_type, 0, NULL, 0); - regfree(®ex); + regfree(®ex); - if (ret == 0) { - response = &platform_data_list[idx]; - break; - } else if (ret != REG_NOMATCH) { - NCCL_OFI_WARN("Regex match failed"); - goto done; + if (ret == 0) { + response = &platform_data_list[idx]; + break; + } else if (ret != REG_NOMATCH) { + NCCL_OFI_WARN("Regex match failed"); + goto done; + } } } diff --git a/tests/unit/aws_platform_mapper.c b/tests/unit/aws_platform_mapper.c index 1b6a14cdc..3885d01c7 100644 --- a/tests/unit/aws_platform_mapper.c +++ b/tests/unit/aws_platform_mapper.c @@ -43,22 +43,22 @@ static int check_known_platforms(void) platform_data_list = platform_aws_get_platform_map(&len); - ret += check_value(platform_data_list, len, "trn1.32xlarge", "^trn1.*"); - ret += check_value(platform_data_list, len, "trn1n.32xlarge", "^trn1.*"); - ret += check_value(platform_data_list, len, "trn1.2xlarge", "^trn1.*"); - ret += check_value(platform_data_list, len, "trn2.48xlarge", "^trn2.*"); - ret += check_value(platform_data_list, len, "trn2u.48xlarge", "^trn2.*"); + ret += check_value(platform_data_list, len, "trn1.32xlarge", "trn1"); + ret += check_value(platform_data_list, len, "trn1n.32xlarge", "trn1"); + ret += check_value(platform_data_list, len, "trn1.2xlarge", "trn1"); + ret += check_value(platform_data_list, len, "trn2.48xlarge", "trn2"); + ret += check_value(platform_data_list, len, "trn2u.48xlarge", "trn2"); ret += check_value(platform_data_list, len, "inf2.48xlarge", NULL); ret += check_value(platform_data_list, len, "p3.2xlarge", NULL); ret += check_value(platform_data_list, len, "p3.8xlarge", NULL); ret += check_value(platform_data_list, len, "p3.16xlarge", NULL); - ret += check_value(platform_data_list, len, "p3dn.24xlarge", "^p3dn.24xlarge$"); - ret += check_value(platform_data_list, len, "p4d.24xlarge", "^p4d.24xlarge$"); - ret += check_value(platform_data_list, len, "p4de.24xlarge", "^p4de.24xlarge$"); - ret += check_value(platform_data_list, len, "p5.48xlarge", "^p5.*"); - ret += check_value(platform_data_list, len, "p5e.48xlarge", "^p5.*"); - ret += check_value(platform_data_list, len, "p5en.48xlarge", "^p5.*"); - ret += check_value(platform_data_list, len, "g5.48xlarge", "^g5.48xlarge$"); + ret += check_value(platform_data_list, len, "p3dn.24xlarge", "p3dn.24xlarge"); + ret += check_value(platform_data_list, len, "p4d.24xlarge", "p4d.24xlarge"); + ret += check_value(platform_data_list, len, "p4de.24xlarge", "p4de.24xlarge"); + ret += check_value(platform_data_list, len, "p5.48xlarge", "p-series"); + ret += check_value(platform_data_list, len, "p5e.48xlarge", "p-series"); + ret += check_value(platform_data_list, len, "p5en.48xlarge", "p-series"); + ret += check_value(platform_data_list, len, "g5.48xlarge", "g5.48xlarge"); ret += check_value(platform_data_list, len, "g6.16xlarge", NULL); return ret; @@ -67,7 +67,8 @@ static int check_known_platforms(void) static struct ec2_platform_data test_map_1[] = { { - .name = "^platform-x$", + .name = "first", + .regex = "^platform-x$", .topology = NULL, .default_dup_conns = 0, .latency = 0.0, @@ -77,7 +78,8 @@ static struct ec2_platform_data test_map_1[] = { .domain_per_thread = 0, }, { - .name = "^platform.*", + .name = "second", + .regex = "^platform.*", .topology = NULL, .default_dup_conns = 0, .latency = 0.0, @@ -95,8 +97,8 @@ int main(int argc, char *argv[]) { ret += check_known_platforms(); /* make sure we maintain ordering */ - ret += check_value(test_map_1, 2, "platform-x", "^platform-x$"); - ret += check_value(test_map_1, 2, "platform-xy", "^platform.*"); + ret += check_value(test_map_1, 2, "platform-x", "first"); + ret += check_value(test_map_1, 2, "platform-xy", "second"); return ret; } From 0fea14658fdc21cf3b64c816dc62564a461bab1c Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 20 Dec 2024 05:16:45 +0000 Subject: [PATCH 5/5] aws: Extend p5 regex to future p-series Add a regex to match all instances p5 or later in the p-series. Moved to extended regex compilation to make the regex slightly more sane. Signed-off-by: Brian Barrett --- src/platform-aws.c | 9 +++++++-- tests/unit/aws_platform_mapper.c | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/platform-aws.c b/src/platform-aws.c index 3cf3f58f2..dafc4ea2f 100644 --- a/src/platform-aws.c +++ b/src/platform-aws.c @@ -71,7 +71,12 @@ static struct ec2_platform_data platform_data_map[] = { }, { .name = "p-series", - .regex = "^p5.*", + /* + * we only want to match P5 and later, as earlier + * platforms all either need to be ignored or special + * cased. + */ + .regex = "^p([5-9]|[0-9]{2,}).*", .topology = NULL, .default_dup_conns = 0, .latency = 75.0, @@ -145,7 +150,7 @@ struct ec2_platform_data *platform_aws_get_platform_entry(const char *platform_t break; } } else { - ret = regcomp(®ex, platform_data_list[idx].regex, 0); + ret = regcomp(®ex, platform_data_list[idx].regex, REG_EXTENDED); if (ret != 0) { NCCL_OFI_WARN("Could not compile platform_type regex for %s", platform_data_list[idx].regex); diff --git a/tests/unit/aws_platform_mapper.c b/tests/unit/aws_platform_mapper.c index 3885d01c7..d2dce149e 100644 --- a/tests/unit/aws_platform_mapper.c +++ b/tests/unit/aws_platform_mapper.c @@ -61,6 +61,9 @@ static int check_known_platforms(void) ret += check_value(platform_data_list, len, "g5.48xlarge", "g5.48xlarge"); ret += check_value(platform_data_list, len, "g6.16xlarge", NULL); + // obviously future platforms + ret += check_value(platform_data_list, len, "p100.2048xlarge", "p-series"); + return ret; }