Skip to content

Commit

Permalink
freelist: separate out metadata from user data
Browse files Browse the repository at this point in the history
Updates the interface of freelist to return an elem struct that contains
a buffer pointer and (optionally) memory registration info. Also updates
sendrecv, rdma, and scheduler codes to use the new interface.

This separation paves the way for storing freelist user data in GPU
memory.

Signed-off-by: Eric Raut <[email protected]>
  • Loading branch information
rauteric committed Dec 18, 2024
1 parent dc5156c commit 2e26a5f
Show file tree
Hide file tree
Showing 9 changed files with 226 additions and 270 deletions.
126 changes: 26 additions & 100 deletions include/nccl_ofi_freelist.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,13 @@ extern "C" {
#include "nccl_ofi_pthread.h"

/*
* Internal: freelist element structure, only has meaning when the
* element is in the freelist (as opposed to owned by the user). Will
* be the first N bytes of the element buffer when not using memory
* registration.
* Freelist element structure
*/
struct nccl_ofi_freelist_elem_t {
typedef struct nccl_ofi_freelist_elem {
void *ptr;
struct nccl_ofi_freelist_elem_t *next;
};
void *mr_handle;
struct nccl_ofi_freelist_elem *next;
} nccl_ofi_freelist_elem_t;

/*
* Internal: tracking data for blocks of allocated memory
Expand All @@ -38,6 +36,7 @@ struct nccl_ofi_freelist_block_t {
void *memory;
size_t memory_size;
void *mr_handle;
nccl_ofi_freelist_elem_t *entries;
};

/*
Expand Down Expand Up @@ -74,33 +73,6 @@ typedef int (*nccl_ofi_freelist_regmr_fn)(void *opaque, void *data, size_t size,
*/
typedef int (*nccl_ofi_freelist_deregmr_fn)(void *handle);

/*
* Structure describing the registration information for the freelist
* item returned by nccl_ofi_freelist_entry_alloc. The object being
* managed by the freelist (such as a control buffer) should contain
* this structure starting reginfo_offset bytes from the base of the
* structure. The fields should not be modified by the caller.
*/
struct nccl_ofi_freelist_reginfo_t {
/* elem must be the first entry in reginfo_t, and should be
ignored by the caller */
struct nccl_ofi_freelist_elem_t elem;
/* offset from the start of the memory registration for the
start of this buffer */
size_t base_offset;
void *mr_handle;
/* Redzone at the end of this structure. redzone must be the
* last entry in reginfo_t, and should be ignored by the
* caller */
char redzone[MEMCHECK_REDZONE_SIZE];
};
typedef struct nccl_ofi_freelist_reginfo_t nccl_ofi_freelist_reginfo_t;

static_assert(offsetof(nccl_ofi_freelist_reginfo_t, elem) == 0,
"elem is not the first member of the structure nccl_ofi_freelist_reginfo_t");
static_assert(sizeof(nccl_ofi_freelist_reginfo_t) - offsetof(nccl_ofi_freelist_reginfo_t, redzone) == MEMCHECK_REDZONE_SIZE,
"redzone is not the last member of the structure nccl_ofi_freelist_reginfo_t");

/*
* Freelist structure
*
Expand All @@ -114,14 +86,13 @@ struct nccl_ofi_freelist_t {
size_t max_entry_count;
size_t increase_entry_count;

struct nccl_ofi_freelist_elem_t *entries;
nccl_ofi_freelist_elem_t *entries;
struct nccl_ofi_freelist_block_t *blocks;

bool have_reginfo;
nccl_ofi_freelist_regmr_fn regmr_fn;
nccl_ofi_freelist_deregmr_fn deregmr_fn;
void *regmr_opaque;
size_t reginfo_offset;

size_t memcheck_redzone_size;

Expand Down Expand Up @@ -158,15 +129,8 @@ int nccl_ofi_freelist_init(size_t entry_size,
* own memory registration, allowing the freelist to grow over time
* similar to the simple freelist.
*
* Unlike simple freelists, the complex freelist imposes a
* restriction on the item stored in the freelist. The item must
* contain a nccl_ofi_freelist_reginfo_t structure reginfo_offset
* bytes into the structure. The mr_handle field of the reginfo_t
* structure will contain the handle returned from regmr_fn() being
* called for the allocation block and the base_offset field will
* contain the offset (in bytes) from the start of the memory
* registration to the start of the returned freelist entry, allowing
* for use with providers that require 0-based registration accesses.
* The mr_handle field of the elem structure will contain the handle
* returned from regmr_fn() being called for the allocation block.
*/
int nccl_ofi_freelist_init_mr(size_t entry_size,
size_t initial_entry_count,
Expand All @@ -175,7 +139,6 @@ int nccl_ofi_freelist_init_mr(size_t entry_size,
nccl_ofi_freelist_regmr_fn regmr_fn,
nccl_ofi_freelist_deregmr_fn deregmr_fn,
void *regmr_opaque,
size_t reginfo_offset,
size_t entry_alignment,
nccl_ofi_freelist_t **freelist_p);

Expand All @@ -201,36 +164,10 @@ static inline void nccl_ofi_freelist_entry_set_undefined(nccl_ofi_freelist_t *fr
{
size_t user_entry_size = freelist->entry_size - MEMCHECK_REDZONE_SIZE;

if (freelist->have_reginfo) {
size_t reginfo_offset = freelist->reginfo_offset;
size_t elem_size = sizeof(struct nccl_ofi_freelist_elem_t);
size_t reginfo_size = sizeof(struct nccl_ofi_freelist_reginfo_t);
size_t redzone_offset = offsetof(struct nccl_ofi_freelist_reginfo_t, redzone);

/* Entry after reginfo structure is accessible but undefined */
nccl_net_ofi_mem_undefined_unaligned((void*)((uintptr_t)entry_p + reginfo_offset + reginfo_size),
user_entry_size - reginfo_offset - reginfo_size);
/* Redzone at the end of the reginfo structure is
* marked as not accessible */
nccl_net_ofi_mem_noaccess_unaligned((void*)((uintptr_t)entry_p + reginfo_offset + redzone_offset),
MEMCHECK_REDZONE_SIZE);
/* Members of reginfo structure except first and last
* member are accessible and defined */
nccl_net_ofi_mem_defined_unaligned((void*)((uintptr_t)entry_p + reginfo_offset + elem_size),
redzone_offset - elem_size);
/* First member of reginfo structure, i.e.,
* nccl_ofi_freelist_elem_t structure, is marked as
* not accessible */
nccl_net_ofi_mem_noaccess_unaligned((void*)((uintptr_t)entry_p + reginfo_offset), elem_size);
/* First part of entry until reginfo structure is
* accessible but undefined */
nccl_net_ofi_mem_undefined(entry_p, reginfo_offset);
} else {
/* Entry allocated by the user is accessible but
* undefined. Note that this allows the user to
* override the nccl_ofi_freelist_elem_t structure. */
nccl_net_ofi_mem_undefined(entry_p, user_entry_size);
}
/* Entry allocated by the user is accessible but
* undefined. Note that this allows the user to
* override the nccl_ofi_freelist_elem_t structure. */
nccl_net_ofi_mem_undefined(entry_p, user_entry_size);
}

/* Allocate a new freelist item
Expand All @@ -244,18 +181,16 @@ static inline void nccl_ofi_freelist_entry_set_undefined(nccl_ofi_freelist_t *fr
* have previously been allocated and either the freelist has reached
* maximum size or the allocation to grow the freelist has failed.
*
* Regardless of freelist type, the pointer returned will be to the
* first byte in the freelist item. If using complex freelists, the
* reginfo_t structure that is a member of the freelist item will
* contain valid information for the mr_handle and base_offset
* fields. The caller should not write into the bytes covered by the
* reginfo_t structure.
* The pointer returned will be to a nccl_ofi_freelist_elem_t structure that
* contains the buffer pointer and memory registration handle. For complex freelists,
* the elem_t structure will contain valid information for the mr_handle. The
* caller should not write into the bytes covered by the elem_t structure.
*/
static inline void *nccl_ofi_freelist_entry_alloc(nccl_ofi_freelist_t *freelist)
static inline nccl_ofi_freelist_elem_t *nccl_ofi_freelist_entry_alloc
(nccl_ofi_freelist_t *freelist)
{
int ret;
struct nccl_ofi_freelist_elem_t *entry;
void *buf = NULL;
nccl_ofi_freelist_elem_t *entry = NULL;

assert(freelist);

Expand All @@ -273,13 +208,12 @@ static inline void *nccl_ofi_freelist_entry_alloc(nccl_ofi_freelist_t *freelist)
nccl_net_ofi_mem_defined_unaligned(entry, sizeof(*entry));

freelist->entries = entry->next;
buf = entry->ptr;
nccl_ofi_freelist_entry_set_undefined(freelist, buf);
nccl_ofi_freelist_entry_set_undefined(freelist, entry->ptr);

cleanup:
nccl_net_ofi_mutex_unlock(&freelist->lock);

return buf;
return entry;
}

/* Release a freelist item
Expand All @@ -289,28 +223,20 @@ static inline void *nccl_ofi_freelist_entry_alloc(nccl_ofi_freelist_t *freelist)
* entry_p, as corruption may result. Locking to protect the freelist
* is not required by the caller.
*/
static inline void nccl_ofi_freelist_entry_free(nccl_ofi_freelist_t *freelist, void *entry_p)
static inline void nccl_ofi_freelist_entry_free(nccl_ofi_freelist_t *freelist,
nccl_ofi_freelist_elem_t *entry)
{
struct nccl_ofi_freelist_elem_t *entry;
size_t user_entry_size = freelist->entry_size - MEMCHECK_REDZONE_SIZE;

assert(freelist);
assert(entry_p);
assert(entry);

nccl_net_ofi_mutex_lock(&freelist->lock);

if (freelist->have_reginfo) {
entry = (struct nccl_ofi_freelist_elem_t *)((uintptr_t)entry_p + freelist->reginfo_offset);
nccl_net_ofi_mem_defined_unaligned(entry, sizeof(*entry));
} else {
entry = (struct nccl_ofi_freelist_elem_t *)(uintptr_t)entry_p;
entry->ptr = (void *)entry;
}

entry->next = freelist->entries;
freelist->entries = entry;

nccl_net_ofi_mem_noaccess(entry_p, user_entry_size);
nccl_net_ofi_mem_noaccess(entry->ptr, user_entry_size);

nccl_net_ofi_mutex_unlock(&freelist->lock);
}
Expand Down
26 changes: 6 additions & 20 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,26 +184,9 @@ typedef struct nccl_net_ofi_rdma_close_msg {
uint32_t send_comm_id;
} nccl_net_ofi_rdma_close_msg_t;

/* Structure used to store control messages in a free list */
typedef struct nccl_net_ofi_rdma_ctrl_fl_item {
nccl_ofi_freelist_reginfo_t fl_reginfo;
union {
nccl_net_ofi_rdma_ctrl_msg_t ctrl_msg;
nccl_net_ofi_rdma_close_msg_t close_msg;
};
} nccl_net_ofi_rdma_ctrl_fl_item_t;

/* For LL/LL128 protocols, bounce buffers (source of RDMA read operations) need to be 128B aligned */
#define BOUNCE_BUFFER_ALIGNMENT 128

/* Structure used to store bounce buffers in a free list */
typedef struct nccl_net_ofi_rdma_bounce_fl_item {
nccl_ofi_freelist_reginfo_t fl_reginfo;
#define PADDING_SIZE (BOUNCE_BUFFER_ALIGNMENT - (sizeof(nccl_ofi_freelist_reginfo_t) % BOUNCE_BUFFER_ALIGNMENT))
char padding[PADDING_SIZE];
char bounce_msg[];
} nccl_net_ofi_rdma_bounce_fl_item_t;

struct nccl_net_ofi_rdma_req;
struct nccl_net_ofi_rdma_ep;
struct nccl_net_ofi_ep_rail;
Expand All @@ -213,7 +196,7 @@ typedef struct nccl_net_ofi_ep_rail nccl_net_ofi_ep_rail_t;

typedef struct {
/* Bounce buffer freelist item */
nccl_net_ofi_rdma_bounce_fl_item_t *bounce_fl_item;
nccl_ofi_freelist_elem_t *bounce_fl_elem;
/* Length of bounce buffer */
size_t buff_len;
/* Length of received data */
Expand Down Expand Up @@ -289,7 +272,7 @@ typedef struct {
*/
typedef struct {
/* Pointer to the allocated control buffer from freelist */
nccl_net_ofi_rdma_ctrl_fl_item_t *ctrl_fl_item;
nccl_ofi_freelist_elem_t *ctrl_fl_elem;
/* Schedule used to transfer the control buffer. We save the
* pointer to reference it when transferring the buffer over
* network. */
Expand All @@ -306,7 +289,7 @@ typedef struct {
*/
typedef struct {
/* Pointer to the allocated control buffer from freelist */
nccl_net_ofi_rdma_ctrl_fl_item_t *ctrl_fl_item;
nccl_ofi_freelist_elem_t *ctrl_fl_elem;
/* Schedule used to transfer the close buffer. We save the
* pointer to reference it when transferring the buffer over
* network. */
Expand Down Expand Up @@ -420,6 +403,9 @@ typedef struct nccl_net_ofi_rdma_req {
/* Type of request */
nccl_net_ofi_rdma_req_type_t type;

/* Backpointer to freelist element */
nccl_ofi_freelist_elem_t *elem;

/* Deinitialize and free request. This function returns error
* in cases where cleanup fails. This function may also return
* error if the owner of the request has to deallocate the
Expand Down
3 changes: 3 additions & 0 deletions include/nccl_ofi_scheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ typedef struct nccl_net_ofi_schedule {
/* Number of transfer information entries set by the scheduler */
size_t num_xfer_infos;

/* Backpointer to freelist element (for cleanup) */
nccl_ofi_freelist_elem_t *elem;

/* Array of transfer information structs. The array has at
* least 'num_xfer_infos' entries. */
nccl_net_ofi_xfer_info_t rail_xfer_infos[];
Expand Down
3 changes: 3 additions & 0 deletions include/nccl_ofi_sendrecv.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,9 @@ typedef struct nccl_net_ofi_sendrecv_req {

/* Direction of request */
nccl_net_ofi_sendrecv_req_direction_t direction;

/* Backpointer to freelist element (for cleanup) */
nccl_ofi_freelist_elem_t *elem;
} nccl_net_ofi_sendrecv_req_t;


Expand Down
Loading

0 comments on commit 2e26a5f

Please sign in to comment.