prov/efa: Make runt_size aligned
Currently, txe->runt_size can be a non-multiple of the
memory alignment, which causes the following issues:

1. The data size copied on the receiver side is not a
multiple of the memory alignment. This not only makes the
data copy (gdrcopy or local read) less performant, but also
breaks the LL128 protocol for send/recv, which requires the
copied data size to be a multiple of 128 (the memory
alignment in this case).

2. The single_pkt_entry_data_size variable in
efa_rdm_ope_prepare_to_post_send() can become 0 after
the alignment trim.

This patch makes the runt size always aligned before deciding
whether to use the runting read protocol. If the aligned runt
size is 0, runting read is not used.
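
The trim itself is just a power-of-two mask. Below is a minimal
standalone sketch of the idea; the function name
runt_size_align_down is illustrative only, the real logic lives in
efa_rdm_peer_get_runt_size() further down in this diff.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Illustration only: round a candidate runt size down to the nearest
 * multiple of a power-of-two memory alignment. A result of 0 means
 * runting read should not be used at all. */
static size_t runt_size_align_down(size_t runt_size, size_t alignment)
{
	/* the mask trick requires a power-of-two alignment */
	assert(alignment && (alignment & (alignment - 1)) == 0);
	return runt_size & ~(alignment - 1);
}

int main(void)
{
	/* in-order aligned-128-bytes (LL128) send/recv: alignment is 128 */
	printf("%zu\n", runt_size_align_down(1000, 128)); /* 896 */
	/* a runt smaller than the alignment trims to 0 -> no runting read */
	printf("%zu\n", runt_size_align_down(100, 128));  /* 0 */
	return 0;
}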

Also added a series of unit tests to validate this change.
Signed-off-by: Shi Jin <[email protected]>
shijin-aws committed Dec 6, 2023
1 parent 4b25d7b commit ca03e34
Showing 11 changed files with 371 additions and 21 deletions.
4 changes: 3 additions & 1 deletion prov/efa/Makefile.include
@@ -142,7 +142,9 @@ nodist_prov_efa_test_efa_unit_test_SOURCES = \
prov/efa/test/efa_unit_test_rnr.c \
prov/efa/test/efa_unit_test_ope.c \
prov/efa/test/efa_unit_test_send.c \
prov/efa/test/efa_unit_test_fork_support.c
prov/efa/test/efa_unit_test_fork_support.c \
prov/efa/test/efa_unit_test_runt.c


efa_CPPFLAGS += -I$(top_srcdir)/include -I$(top_srcdir)/prov/efa/test $(cmocka_CPPFLAGS)

5 changes: 5 additions & 0 deletions prov/efa/src/efa.h
@@ -106,6 +106,11 @@
#define EFA_DEFAULT_INTER_MIN_READ_WRITE_SIZE (65536)
#define EFA_DEFAULT_INTRA_MAX_GDRCOPY_FROM_DEV_SIZE (3072)

/*
* The default memory alignment
*/
#define EFA_RDM_DEFAULT_MEMORY_ALIGNMENT (8)

/*
* The CUDA memory alignment
*/
2 changes: 2 additions & 0 deletions prov/efa/src/rdm/efa_rdm_ep.h
@@ -302,6 +302,8 @@ void efa_rdm_ep_queue_rnr_pkt(struct efa_rdm_ep *ep,
ssize_t efa_rdm_ep_post_queued_pkts(struct efa_rdm_ep *ep,
struct dlist_entry *pkts);

size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, enum fi_hmem_iface iface);

static inline
struct efa_domain *efa_rdm_ep_domain(struct efa_rdm_ep *ep)
{
20 changes: 20 additions & 0 deletions prov/efa/src/rdm/efa_rdm_ep_utils.c
@@ -673,3 +673,23 @@ void efa_rdm_ep_post_handshake_or_queue(struct efa_rdm_ep *ep, struct efa_rdm_pe

peer->flags |= EFA_RDM_PEER_HANDSHAKE_SENT;
}

/**
* @brief Get memory alignment for given ep and hmem iface
*
* @param ep efa rdm ep
* @param iface hmem iface
* @return size_t the memory alignment
*/
size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, enum fi_hmem_iface iface)
{
size_t memory_alignment = EFA_RDM_DEFAULT_MEMORY_ALIGNMENT;

if (ep->sendrecv_in_order_aligned_128_bytes) {
memory_alignment = EFA_RDM_IN_ORDER_ALIGNMENT;
} else if (iface == FI_HMEM_CUDA) {
memory_alignment = EFA_RDM_CUDA_MEMORY_ALIGNMENT;
}

return memory_alignment;
}
2 changes: 1 addition & 1 deletion prov/efa/src/rdm/efa_rdm_msg.c
@@ -90,7 +90,7 @@ int efa_rdm_msg_select_rtm(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *tx

eager_rtm_max_data_size = efa_rdm_txe_max_req_data_capacity(efa_rdm_ep, txe, eager_rtm);

readbase_rtm = efa_rdm_peer_select_readbase_rtm(peer, txe->op, txe->fi_flags, &hmem_info[iface]);
readbase_rtm = efa_rdm_peer_select_readbase_rtm(peer, efa_rdm_ep, txe);

if (txe->total_len >= hmem_info[iface].min_read_msg_size &&
efa_rdm_interop_rdma_read(efa_rdm_ep, peer) &&
20 changes: 8 additions & 12 deletions prov/efa/src/rdm/efa_rdm_ope.c
@@ -354,8 +354,6 @@ int efa_rdm_txe_prepare_to_be_read(struct efa_rdm_ope *txe, struct fi_rma_iov *r
static inline
void efa_rdm_txe_set_runt_size(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe)
{
int iface;
struct efa_hmem_info *hmem_info;
struct efa_rdm_peer *peer;

assert(txe->type == EFA_RDM_TXE);
@@ -365,11 +363,10 @@ void efa_rdm_txe_set_runt_size(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe)

peer = efa_rdm_ep_get_peer(ep, txe->addr);

iface = txe->desc[0] ? ((struct efa_mr*) txe->desc[0])->peer.iface : FI_HMEM_SYSTEM;
hmem_info = &efa_rdm_ep_domain(ep)->hmem_info[iface];

assert(peer);
txe->bytes_runt = MIN(hmem_info->runt_size - peer->num_runt_bytes_in_flight, txe->total_len);
txe->bytes_runt = efa_rdm_peer_get_runt_size(peer, ep, txe);

assert(txe->bytes_runt);
}

/**
@@ -481,7 +478,7 @@ ssize_t efa_rdm_ope_prepare_to_post_send(struct efa_rdm_ope *ope,
size_t total_pkt_entry_data_size; /* total number of bytes send via packet entry's payload */
size_t single_pkt_entry_data_size;
size_t single_pkt_entry_max_data_size;
int i, memory_alignment = 8, remainder;
int i, memory_alignment, remainder, iface;
int available_tx_pkts;

ep = ope->ep;
@@ -520,11 +517,8 @@ ssize_t efa_rdm_ope_prepare_to_post_send(struct efa_rdm_ope *ope,
single_pkt_entry_max_data_size = efa_rdm_txe_max_req_data_capacity(ep, ope, pkt_type);
assert(single_pkt_entry_max_data_size);

if (ep->sendrecv_in_order_aligned_128_bytes) {
memory_alignment = EFA_RDM_IN_ORDER_ALIGNMENT;
} else if (efa_mr_is_cuda(ope->desc[0])) {
memory_alignment = EFA_RDM_CUDA_MEMORY_ALIGNMENT;
}
iface = ope->desc[0] ? ((struct efa_mr*) ope->desc[0])->peer.iface : FI_HMEM_SYSTEM;
memory_alignment = efa_rdm_ep_get_memory_alignment(ep, iface);

*pkt_entry_cnt = (total_pkt_entry_data_size - 1) / single_pkt_entry_max_data_size + 1;
if (*pkt_entry_cnt > available_tx_pkts)
@@ -537,6 +531,8 @@ ssize_t efa_rdm_ope_prepare_to_post_send(struct efa_rdm_ope *ope,

/* each packet must be aligned */
single_pkt_entry_data_size = single_pkt_entry_data_size & ~(memory_alignment - 1);
assert(single_pkt_entry_data_size);

*pkt_entry_cnt = total_pkt_entry_data_size / single_pkt_entry_data_size;
for (i = 0; i < *pkt_entry_cnt; ++i)
pkt_entry_data_size_vec[i] = single_pkt_entry_data_size;
48 changes: 42 additions & 6 deletions prov/efa/src/rdm/efa_rdm_peer.c
@@ -254,21 +254,57 @@ void efa_rdm_peer_proc_pending_items_in_robuf(struct efa_rdm_peer *peer, struct
return;
}

/**
* @brief Get the runt size for a given peer and ope
*
* @param peer rdm peer
* @param ep efa rdm ep
* @param ope efa rdm ope
* @return size_t the number of bytes that can be runt
*/
size_t efa_rdm_peer_get_runt_size(struct efa_rdm_peer *peer,
struct efa_rdm_ep *ep, struct efa_rdm_ope *ope)
{
struct efa_hmem_info *hmem_info;
size_t runt_size;
size_t memory_alignment;
int iface;

hmem_info = efa_rdm_ep_domain(ep)->hmem_info;
iface = ope->desc[0] ? ((struct efa_mr*) ope->desc[0])->peer.iface : FI_HMEM_SYSTEM;

if (hmem_info[iface].runt_size < peer->num_runt_bytes_in_flight)
return 0;

runt_size = MIN(hmem_info[iface].runt_size - peer->num_runt_bytes_in_flight, ope->total_len);
memory_alignment = efa_rdm_ep_get_memory_alignment(ep, iface);
/*
* runt size must be aligned because:
* 1. For LL128 protocol, the size to be copied on the receiver side must be 128-multiple,
* 128 is the alignment in this case.
* 2. For non-LL128 protocol, using aligned runt size has optimal performance for data copy.
* Note the returned value can be 0. In that case we will not use runting read protocol.
*/
return (runt_size & ~(memory_alignment - 1));
}

/**
* @brief Determine which Read based protocol to use for a given peer
*
* @param[in] peer rdm peer
* @param[in] op operation type
* @param[in] flags the flags that the application used to call fi_* functions
* @param[in] hmem_info configured protocol limits
* @param[in] ep efa rdm ep
* @param[in] ope efa rdm ope
* @return The read-based protocol to use based on inputs.
*/
int efa_rdm_peer_select_readbase_rtm(struct efa_rdm_peer *peer, int op, uint64_t fi_flags, struct efa_hmem_info *hmem_info)
int efa_rdm_peer_select_readbase_rtm(struct efa_rdm_peer *peer,
struct efa_rdm_ep *ep, struct efa_rdm_ope *ope)
{
int op = ope->op;

assert(op == ofi_op_tagged || op == ofi_op_msg);
if (peer->num_read_msg_in_flight == 0 &&
hmem_info->runt_size > peer->num_runt_bytes_in_flight &&
!(fi_flags & FI_DELIVERY_COMPLETE)) {
efa_rdm_peer_get_runt_size(peer, ep, ope) > 0 &&
!(ope->fi_flags & FI_DELIVERY_COMPLETE)) {
return (op == ofi_op_tagged) ? EFA_RDM_RUNTREAD_TAGRTM_PKT
: EFA_RDM_RUNTREAD_MSGRTM_PKT;
} else {
4 changes: 3 additions & 1 deletion prov/efa/src/rdm/efa_rdm_peer.h
@@ -251,6 +251,8 @@ int efa_rdm_peer_reorder_msg(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, s

void efa_rdm_peer_proc_pending_items_in_robuf(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep);

int efa_rdm_peer_select_readbase_rtm(struct efa_rdm_peer *peer, int op, uint64_t fi_flags, struct efa_hmem_info *hmem_info);
size_t efa_rdm_peer_get_runt_size(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_rdm_ope *ope);

int efa_rdm_peer_select_readbase_rtm(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_rdm_ope *ope);

#endif /* EFA_RDM_PEER_H */
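
The remaining changed files, including the new
prov/efa/test/efa_unit_test_runt.c referenced in the Makefile hunk
above, are not expanded on this page. As a rough idea only, here is a
small standalone cmocka example of the alignment property those tests
are meant to validate; it does not call the provider's internal
helpers, and the test name is made up.

/* Hypothetical sketch -- the real tests live in
 * prov/efa/test/efa_unit_test_runt.c and exercise
 * efa_rdm_peer_get_runt_size() through the provider's unit-test
 * fixtures. This standalone example only checks the alignment
 * property described in the commit message. */
#include <stdarg.h>
#include <stddef.h>
#include <setjmp.h>
#include <stdint.h>
#include <cmocka.h>

/* mirror of the trim applied in efa_rdm_peer_get_runt_size() */
static size_t align_down(size_t size, size_t alignment)
{
	return size & ~(alignment - 1);
}

static void test_runt_size_is_aligned(void **state)
{
	(void) state;
	/* an aligned runt size is always a multiple of the alignment ... */
	assert_int_equal(align_down(1000, 128) % 128, 0);
	/* ... and a runt smaller than the alignment trims to 0, in which
	 * case the runting read protocol must not be selected */
	assert_int_equal(align_down(100, 128), 0);
}

int main(void)
{
	const struct CMUnitTest tests[] = {
		cmocka_unit_test(test_runt_size_is_aligned),
	};
	return cmocka_run_group_tests(tests, NULL, NULL);
}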