From 8e025a20092130a02bb432d23d9f29522a8edd8b Mon Sep 17 00:00:00 2001 From: Jiakun Yan Date: Mon, 9 Dec 2024 20:00:58 -0600 Subject: [PATCH] packet pool: add option to control global/per-device packet pool (LCI_USE_GLOBAL_PACKET_POOL) --- lci/api/lci.h | 6 ++++++ lci/runtime/device.c | 11 ++++++++++- lci/runtime/env.c | 2 ++ lci/runtime/lci.c | 4 ++-- lci/runtime/lcii.h | 2 ++ lci/runtime/packet_pool.h | 2 +- 6 files changed, 23 insertions(+), 4 deletions(-) diff --git a/lci/api/lci.h b/lci/api/lci.h index e0584130..ab2d6bdc 100644 --- a/lci/api/lci.h +++ b/lci/api/lci.h @@ -624,6 +624,12 @@ extern bool LCI_UCX_USE_TRY_LOCK; */ extern bool LCI_UCX_PROGRESS_FOCUSED; +/** + * @ingroup LCI_DEVICE + * @brief Use one global packet pool (default) instead of one per device. + */ +extern bool LCI_USE_GLOBAL_PACKET_POOL; + /** * @ingroup LCI_COMM * @brief Try_lock mode of network backend. diff --git a/lci/runtime/device.c b/lci/runtime/device.c index e351fa76..96c2d06a 100644 --- a/lci/runtime/device.c +++ b/lci/runtime/device.c @@ -60,7 +60,12 @@ LCI_error_t LCI_device_init(LCI_device_t* device_ptr) LCII_bq_init(&device->bq); LCIU_spinlock_init(&device->bq_spinlock); - device->heap = &g_heap; + if (LCI_USE_GLOBAL_PACKET_POOL) { + device->heap = &g_heap; + } else { + device->heap = LCIU_malloc(sizeof(LCII_packet_heap_t)); + initialize_packet_heap(device->heap); + } LCI_memory_register(device, device->heap->address, device->heap->length, &device->heap_segment); @@ -89,6 +94,10 @@ LCI_error_t LCI_device_free(LCI_device_t* device_ptr) LCII_endpoint_get_recv_posted(device->endpoint_progress); } LCI_memory_deregister(&device->heap_segment); + if (!LCI_USE_GLOBAL_PACKET_POOL) { + finalize_packet_heap(device->heap); + LCIU_free(device->heap); + } LCII_matchtable_free(&device->mt); LCM_archive_fini(&(device->ctx_archive)); LCII_bq_fini(&device->bq); diff --git a/lci/runtime/env.c b/lci/runtime/env.c index fbe59aa4..f0470868 100644 --- a/lci/runtime/env.c +++ b/lci/runtime/env.c @@ -34,6 +34,7 @@ LCI_API bool 
LCI_OFI_CXI_TRY_NO_HACK; LCI_API uint64_t LCI_BACKEND_TRY_LOCK_MODE; LCI_API bool LCI_UCX_USE_TRY_LOCK; LCI_API bool LCI_UCX_PROGRESS_FOCUSED; +LCI_API bool LCI_USE_GLOBAL_PACKET_POOL; LCI_API LCI_device_t LCI_UR_DEVICE; LCI_API LCI_endpoint_t LCI_UR_ENDPOINT; LCI_API LCI_comp_t LCI_UR_CQ; @@ -129,6 +130,7 @@ void LCII_env_init(int num_proc, int rank) } LCI_UCX_USE_TRY_LOCK = LCIU_getenv_or("LCI_UCX_USE_TRY_LOCK", 0); LCI_UCX_PROGRESS_FOCUSED = LCIU_getenv_or("LCI_UCX_PROGRESS_FOCUSED", 0); + LCI_USE_GLOBAL_PACKET_POOL = LCIU_getenv_or("LCI_USE_GLOBAL_PACKET_POOL", 1); if (LCI_UCX_PROGRESS_FOCUSED) LCI_UCX_USE_TRY_LOCK = true; LCII_env_init_cq_type(); LCII_env_init_rdv_protocol(); diff --git a/lci/runtime/lci.c b/lci/runtime/lci.c index d69dc834..2d5c5d76 100644 --- a/lci/runtime/lci.c +++ b/lci/runtime/lci.c @@ -79,7 +79,7 @@ LCI_error_t LCI_initialize() } // initialize global data structure LCIS_server_init(&g_server); - initialize_packet_heap(&g_heap); + if (LCI_USE_GLOBAL_PACKET_POOL) initialize_packet_heap(&g_heap); // UR objects LCI_device_init(&LCI_UR_DEVICE); LCI_queue_create(LCI_UR_DEVICE, &LCI_UR_CQ); @@ -111,7 +111,7 @@ LCI_error_t LCI_finalize() LCI_queue_free(&LCI_UR_CQ); LCI_device_free(&LCI_UR_DEVICE); LCIS_server_fina(g_server); - finalize_packet_heap(&g_heap); + if (LCI_USE_GLOBAL_PACKET_POOL) finalize_packet_heap(&g_heap); if (LCI_USE_DREG) { #ifdef LCI_COMPILE_DREG LCII_ucs_cleanup(); diff --git a/lci/runtime/lcii.h b/lci/runtime/lcii.h index 7b476a4c..1302c2a0 100644 --- a/lci/runtime/lcii.h +++ b/lci/runtime/lcii.h @@ -66,6 +66,8 @@ struct LCII_packet_heap_t { int total_recv_posted; // for debugging purpose }; typedef struct LCII_packet_heap_t LCII_packet_heap_t; +extern void initialize_packet_heap(LCII_packet_heap_t* heap); +extern void finalize_packet_heap(LCII_packet_heap_t* heap); extern LCIS_server_t g_server; extern LCII_packet_heap_t g_heap; diff --git a/lci/runtime/packet_pool.h b/lci/runtime/packet_pool.h index 82676bcf..909c84f2 
100644 --- a/lci/runtime/packet_pool.h +++ b/lci/runtime/packet_pool.h @@ -10,7 +10,7 @@ #include #define MAX_NPOOLS 272 -#define MAX_LOCAL_POOL 32 // align to a cache line. +#define MAX_LOCAL_POOL 128 extern int LCII_pool_nkey; extern int32_t LCII_tls_pool_metadata[MAX_NPOOLS][MAX_LOCAL_POOL];