Skip to content

Commit

Permalink
prov/psm3: update provider to sync with IEFS 11.6.0.0.231
Browse files Browse the repository at this point in the history
Updates:
- Full support for Intel oneAPI DPC++/C++ compiler
- Improved default tuning for Intel GPUs

Signed-off-by: Scott Breyer <[email protected]>
  • Loading branch information
sjb017 committed Mar 12, 2024
1 parent 9be500e commit 241f82a
Show file tree
Hide file tree
Showing 66 changed files with 5,521 additions and 1,840 deletions.
34 changes: 24 additions & 10 deletions prov/psm3/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ ACLOCAL_AMFLAGS = -I config
AM_CFLAGS = -Wall

if HAVE_LD_VERSION_SCRIPT
libpsm3_fi_version_script = -Wl,--version-script=$(builddir)/libpsm3-fi.map
libpsm3_fi_version_script = -Wl,--version-script=$(builddir)/libpsm3-fi.map
else !HAVE_LD_VERSION_SCRIPT
libpsm3_fi_version_script =
libpsm3_fi_version_script =
endif !HAVE_LD_VERSION_SCRIPT

# rdmaincludedir = $(includedir)/rdma
Expand All @@ -51,6 +51,8 @@ common_srcs = \
shared/hmem_neuron.c \
shared/hmem_synapseai.c \
shared/hmem_ipc_cache.c \
shared/xpmem.c \
shared/xpmem_cache.c \
shared/common.c \
shared/enosys.c \
shared/rbtree.c \
Expand Down Expand Up @@ -78,13 +80,22 @@ common_srcs = \
util/src/util_ns.c \
util/src/util_pep.c \
util/src/util_poll.c \
util/src/util_profile.c \
util/src/util_srx.c \
util/src/util_wait.c \
util/src/rxm_av.c \
util/src/cuda_mem_monitor.c \
util/src/cuda_ipc_monitor.c \
util/src/rocr_mem_monitor.c \
util/src/rocr_ipc_monitor.c \
util/src/ze_mem_monitor.c
util/src/ze_mem_monitor.c \
util/src/xpmem_monitor.c \
shared/fabric.c \
shared/fi_tostr.c \
shared/perf.c \
shared/log.c \
shared/var.c \
shared/abi_1_0.c

if MACOS
common_srcs += shared/osx/osd.c
Expand All @@ -103,9 +114,7 @@ if LINUX
common_srcs += shared/unix/osd.c
common_srcs += shared/linux/osd.c
if HAVE_LINUX_PERF_RDPMC
if !HAVE_PSM3_SRC
common_srcs += shared/linux/rdpmc.c #seems to be a copy of psm3/psm_perf.c
endif
common_srcs += shared/linux/rdpmc.c
endif
common_srcs += inc/linux/rdpmc.h
common_srcs += inc/linux/osd.h
Expand All @@ -120,6 +129,8 @@ bin_SCRIPTS =
nodist_src_libpsm3_fi_la_SOURCES =
src_libpsm3_fi_la_SOURCES = \
inc/ofi_hmem.h \
inc/ofi_cma.h \
inc/ofi_xpmem.h \
inc/ofi.h \
inc/ofi_abi.h \
inc/ofi_atom.h \
Expand All @@ -137,7 +148,7 @@ src_libpsm3_fi_la_SOURCES = \
inc/ofi_proto.h \
inc/ofi_recvwin.h \
inc/ofi_rbuf.h \
inc/ofi_shm.h \
inc/ofi_shm_p2p.h \
inc/ofi_signal.h \
inc/ofi_epoll.h \
inc/ofi_tree.h \
Expand All @@ -148,10 +159,12 @@ src_libpsm3_fi_la_SOURCES = \
inc/ofi_net.h \
inc/ofi_perf.h \
inc/ofi_coll.h \
inc/ofi_mb.h \
inc/fasthash.h \
inc/rbtree.h \
inc/uthash.h \
inc/ofi_prov.h \
inc/ofi_profile.h \
inc/rdma/providers/fi_log.h \
inc/rdma/providers/fi_prov.h \
inc/rdma/providers/fi_peer.h \
Expand All @@ -167,6 +180,7 @@ src_libpsm3_fi_la_SOURCES = \
inc/rdma/fi_errno.h \
inc/rdma/fi_tagged.h \
inc/rdma/fi_trigger.h \
inc/rdma/fi_profile.h \
src/psmx3.h \
src/psmx3_am.c \
src/psmx3_atomic.c \
Expand Down Expand Up @@ -216,7 +230,7 @@ src_libpsm3_fi_la_LDFLAGS += -lpsm2
endif !HAVE_PSM3_SRC

if !EMBEDDED
src_libpsm3_fi_la_LDFLAGS += -version-info 22:0:21
src_libpsm3_fi_la_LDFLAGS += -version-info 24:0:23
endif

prov_install_man_pages = man/man7/fi_psm3.7
Expand Down Expand Up @@ -249,8 +263,8 @@ src/psm3_src_chksum.h: Makefile $(chksum_srcs)

nroff:
@for file in $(prov_install_man_pages); do \
source=`echo $$file | sed -e 's@/man[0-9]@@'`; \
perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \
source=`echo $$file | sed -e 's@/man[0-9]@@'`; \
perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \
done

dist-hook: libpsm3-fi.spec
Expand Down
2 changes: 2 additions & 0 deletions prov/psm3/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,8 @@ prov_psm3_psm3_libpsm3i_la_SOURCES = \
prov/psm3/psm3/psm_mq_recv.c \
prov/psm3/psm3/psm_mq_utils.c \
prov/psm3/psm3/psm_netutils.h \
prov/psm3/psm3/psm_nic_select.c \
prov/psm3/psm3/psm_nic_select.h \
prov/psm3/psm3/psm_oneapi_ze.c \
prov/psm3/psm3/psm_perf.c \
prov/psm3/psm3/psm_perf.h \
Expand Down
2 changes: 1 addition & 1 deletion prov/psm3/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3_5_1_1
3_6_0_0
2 changes: 1 addition & 1 deletion prov/psm3/debian/changelog
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
libpsm3-fi (11.5.1.1-1) unstable; urgency=medium
libpsm3-fi (11.6.0.0-231) unstable; urgency=medium

* Initial release

Expand Down
2 changes: 1 addition & 1 deletion prov/psm3/debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Source: libpsm3-fi
Section: libs
Priority: optional
Maintainer: https://www.intel.com/content/www/us/en/support.html
Build-Depends: debhelper (>= 12~), uuid-dev, libnuma-dev, libibverbs-dev, librdmacm-dev
Build-Depends: debhelper (>= 12~), uuid-dev, libnuma-dev, libibverbs-dev, librdmacm-dev, libhwloc-dev
Standards-Version: 4.5.1
Rules-Requires-Root: no

Expand Down
155 changes: 124 additions & 31 deletions prov/psm3/psm3/hal_sockets/sockets_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,16 @@ psm3_ep_open_udp_internal(psm2_ep_t ep, int unit, int port,
}

if (!is_aux) {
psm3_getenv("PSM3_UDP_GSO",
"Enable UDP GSO Segmentation Offload (0 disables GSO)",
PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
(union psmi_envvar_val)1, &env_gso);
ep->sockets_ep.udp_gso = env_gso.e_int;
psm3_getenv_range("PSM3_UDP_GSO",
"Enable UDP GSO Segmentation Offload",
"(0 disables GSO, 1 sets max chunk to 65536, >1 specifies max chunk)",
PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
(union psmi_envvar_val)UINT16_MAX,
(union psmi_envvar_val)0, (union psmi_envvar_val)UINT16_MAX,
NULL, NULL, &env_gso);
ep->sockets_ep.udp_gso = env_gso.e_uint;
if (ep->sockets_ep.udp_gso == 1)
ep->sockets_ep.udp_gso = UINT16_MAX;
if (ep->sockets_ep.udp_gso) {
int gso;
socklen_t optlen = sizeof(gso);
Expand Down Expand Up @@ -553,6 +558,57 @@ psm2_error_t psm3_tune_tcp_socket(const char *sck_name, psm2_ep_t ep, int fd)
return PSM2_INTERNAL_ERR;
}

/* Parse and validate the TCP port range string for PSM3_TCP_PORT_RANGE.
 *
 * Format is low:high.
 * Either field can be omitted, in which case the default (the value already
 * in tvals on input) is kept for that field.
 *
 * Validation applied after a successful parse, in this order:
 *   - low and high must each be <= UINT16_MAX (UINT16_MAX itself is accepted)
 *   - low and high must each be >= 0
 *   - low and high must either both be TCP_PORT_AUTODETECT or neither may be
 *   - low must be <= high
 *
 * str         - string to parse
 * errstr_size - size of errstr; 0 means no error text is wanted, in which
 *               case errstr is never written (and may be NULL)
 * errstr      - buffer which receives a short explanation on a range error
 * tvals       - in: defaults for low/high, out: parsed values
 *
 * Returns:
 *    0 - successfully parsed and validated, tvals updated
 *   <0 - a negative result from psm3_parse_str_tuples is returned as is
 *        (-1 - str empty, tvals unchanged; -2 - syntax error)
 *   -2 - value failed validation, tvals may have been changed
 */
static int parse_tcp_port_range(const char *str,
			size_t errstr_size, char errstr[],
			int tvals[2])
{
	psmi_assert(tvals);
	int ret = psm3_parse_str_tuples(str, 2, tvals);
	if (ret < 0)
		return ret;
	if (tvals[0] > UINT16_MAX || tvals[1] > UINT16_MAX) {
		// UINT16_MAX has type int after integer promotion, so cast it
		// to match the %u conversion
		if (errstr_size)
			snprintf(errstr, errstr_size, " Max allowed is %u", (unsigned)UINT16_MAX);
		return -2;
	}
	if (tvals[0] < 0 || tvals[1] < 0) {
		if (errstr_size)
			snprintf(errstr, errstr_size, " Negative values not allowed");
		return -2;
	}
	// autodetect is all or nothing: a range with only one end set to
	// TCP_PORT_AUTODETECT is rejected
	if ((tvals[0] == TCP_PORT_AUTODETECT && tvals[1] != TCP_PORT_AUTODETECT)
		|| (tvals[0] != TCP_PORT_AUTODETECT && tvals[1] == TCP_PORT_AUTODETECT)) {
		if (errstr_size)
			snprintf(errstr, errstr_size, " low of %d only allowed with high of %d", TCP_PORT_AUTODETECT, TCP_PORT_AUTODETECT);
		return -2;
	}
	if (tvals[0] > tvals[1]) {
		if (errstr_size)
			snprintf(errstr, errstr_size, " low (%d) > high (%d)", tvals[0], tvals[1]);
		return -2;
	}
	return 0;
}

/* Check callback handed to psm3_getenv_range for PSM3_TCP_PORT_RANGE.
 *
 * ptr points at the caller's int[2] of default low/high values.
 * val.e_str is run through parse_tcp_port_range and its result is returned.
 * The parse is done into a local scratch array so the caller's defaults
 * are left untouched.
 */
static int parse_check_tcp_port_range(int type,
			const union psmi_envvar_val val, void *ptr,
			size_t errstr_size, char errstr[])
{
	const int *defaults = (const int *)ptr;
	int low = defaults[0];
	int high = defaults[1];
	int scratch[2];

	psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES);

	// the parser overwrites what it is given, so hand it a private copy
	scratch[0] = low;
	scratch[1] = high;
	return parse_tcp_port_range(val.e_str, errstr_size, errstr, scratch);
}

static __inline__
psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd,
psm3_sockaddr_in_t *addr,
Expand All @@ -567,12 +623,16 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd,
char range_def[32];
snprintf(range_def, sizeof(range_def), "%d:%d", tvals[0], tvals[1]);

if (!psm3_getenv("PSM3_TCP_PORT_RANGE",
"Set the TCP listener port range <low:high>. The listener will bind to a random port in the range. '0:0'=let OS pick.",
(void)psm3_getenv_range("PSM3_TCP_PORT_RANGE",
"Set the TCP listener port range <low:high>.",
"The listener will bind to a random port in the range. '0:0'=let OS pick.",
PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES,
(union psmi_envvar_val) range_def, &env_val)) {
/* not using default values */
(void)psm3_parse_str_tuples(env_val.e_str, 2, tvals);
(union psmi_envvar_val) range_def,
(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
parse_check_tcp_port_range, tvals, &env_val);
if (parse_tcp_port_range(env_val.e_str, 0, NULL, tvals) < 0) {
// already checked, shouldn't get parse errors nor empty strings
psmi_assert(0);
}

_HFI_DBG("PSM3_TCP_PORT_RANGE = %d:%d\n", tvals[0], tvals[1]);
Expand All @@ -583,17 +643,14 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd,
start = 0;
end = 0;
_HFI_DBG("Binding to OS provided port\n");
} else if (tvals[0] > 0 && tvals[0] <= tvals[1] && tvals[1] <= UINT16_MAX) {
} else {
psmi_assert(tvals[0] > 0);
// start with a random port, find the first available one.
port = psm3_rand((long int) getpid());
port = port % (tvals[1] + 1 - tvals[0]) + tvals[0];
start = (uint16_t)tvals[0];
end = (uint16_t)tvals[1];
_HFI_DBG("Binding to port in range [%" PRIu16 ":%" PRIu16 "], starting from %ld\n", start, end, port);
} else {
// high < low or only set one
_HFI_ERROR("Invalid TCP port range [%d:%d]\n", tvals[0], tvals[1]);
return PSM2_INTERNAL_ERR;
}

psm3_getenv("PSM3_TCP_BACKLOG",
Expand Down Expand Up @@ -637,6 +694,46 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd,
return PSM2_INTERNAL_ERR;
}

/* Parse and validate the skip poll counts for PSM3_TCP_SKIPPOLL_COUNT.
 *
 * Format is inactive_polls:active_polls.
 * Both counts must be non-negative and inactive_polls must be >=
 * active_polls.
 * An omitted field keeps the default supplied in tvals on input.
 *
 * errstr is only written when errstr_size is non-zero.
 *
 * Returns:
 *    0 - successfully parsed, tvals updated
 *   -1 - str empty, tvals unchanged
 *   -2 - syntax error or invalid value, tvals may have been changed
 */
static int parse_tcp_skippoll_count(const char *str,
			size_t errstr_size, char errstr[],
			int tvals[2])
{
	int rc;

	psmi_assert(tvals);

	rc = psm3_parse_str_tuples(str, 2, tvals);
	if (rc < 0)
		return rc;

	// happy path: both counts non-negative and correctly ordered
	if (tvals[0] >= 0 && tvals[1] >= 0 && tvals[0] >= tvals[1])
		return 0;

	// something is wrong; explain it when the caller supplied a buffer.
	// A negative value takes precedence over the ordering complaint.
	if (errstr_size) {
		if (tvals[0] < 0 || tvals[1] < 0)
			snprintf(errstr, errstr_size, " Negative values not allowed");
		else
			snprintf(errstr, errstr_size, " inactive_polls (%d) must be >= active_polls (%d)", tvals[0], tvals[1]);
	}
	return -2;
}

/* Check callback handed to psm3_getenv_range for PSM3_TCP_SKIPPOLL_COUNT.
 *
 * ptr points at the caller's int[2] of default inactive/active poll counts.
 * val.e_str is run through parse_tcp_skippoll_count and its result is
 * returned. The parse is done into a local scratch array so the caller's
 * defaults are left untouched.
 */
static int parse_check_tcp_skippoll_count(int type,
			const union psmi_envvar_val val, void *ptr,
			size_t errstr_size, char errstr[])
{
	const int *defaults = (const int *)ptr;
	int inactive = defaults[0];
	int active = defaults[1];
	int scratch[2];

	psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES);

	// the parser overwrites what it is given, so hand it a private copy
	scratch[0] = inactive;
	scratch[1] = active;
	return parse_tcp_skippoll_count(val.e_str, errstr_size, errstr, scratch);
}

psm2_error_t
psm3_ep_open_tcp_internal(psm2_ep_t ep, int unit, int port,
psm2_uuid_t const job_key)
Expand Down Expand Up @@ -772,21 +869,16 @@ psm3_ep_open_tcp_internal(psm2_ep_t ep, int unit, int port,
char buf[32];
snprintf(buf, sizeof(buf), "%d:%d", TCP_INACT_SKIP_POLLS, TCP_ACT_SKIP_POLLS);
int tvals[2] = {TCP_INACT_SKIP_POLLS, TCP_ACT_SKIP_POLLS};
if (!psm3_getenv("PSM3_TCP_SKIPPOLL_COUNT",
"Polls to skip under inactive and active connections <inactive_polls[:active_polls]> "
(void)psm3_getenv_range("PSM3_TCP_SKIPPOLL_COUNT",
"Polls to skip under inactive and active connections <inactive_polls[:active_polls]> ",
"where inactive_polls >= active_polls.",
PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES,
(union psmi_envvar_val) buf, &env_val)) {
(void)psm3_parse_str_tuples(env_val.e_str, 2, tvals);
if (tvals[0] < 0) {
tvals[0] = TCP_INACT_SKIP_POLLS;
}
if (tvals[1] < 0) {
tvals[1] = TCP_ACT_SKIP_POLLS;
}
if (tvals[1] > tvals[0]) {
tvals[1] = tvals[0];
}
(union psmi_envvar_val) buf,
(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
parse_check_tcp_skippoll_count, tvals, &env_val);
if (parse_tcp_skippoll_count(env_val.e_str, 0, NULL, tvals) < 0) {
// already checked, shouldn't get parse errors nor empty strings
psmi_assert(0);
}
ep->sockets_ep.inactive_skip_polls = tvals[0];
ep->sockets_ep.active_skip_polls_offset = tvals[0] - tvals[1];
Expand Down Expand Up @@ -1084,10 +1176,11 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz)

if (ep->sockets_ep.udp_gso) {
// set upper bounds for GSO segmentation
// OS limitation of 64K (UINT16_MAX)
// OS limitation of 64K (UINT16_MAX) and UDP_MAX_SEGMENTS (64)
ep->chunk_max_segs = min(UINT16_MAX / (ep->mtu + sizeof(struct ips_message_header)), UDP_MAX_SEGMENTS);
ep->chunk_max_size = ep->mq->hfi_base_window_rv;
// for acks to pipeline well need to limit max_nsegs to
ep->chunk_max_size = ep->sockets_ep.udp_gso;

// for acks to pipeline we'll need to limit max_nsegs to
// < flow_credits/2 and max_size to < flow_credit_bytes/2
// (ideally 1/4, but that makes GSO too small and is worse)
ep->chunk_max_segs = min(ep->chunk_max_segs, proto->flow_credits/2);
Expand Down
2 changes: 1 addition & 1 deletion prov/psm3/psm3/hal_sockets/sockets_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ struct psm3_sockets_ep {
int active_skip_polls_offset; // tailored for internal use. it's inactive_skip_polls - active_skip_polls
struct msghdr snd_msg; // struct used for sendmsg
/* fields specific to UDP */
int udp_gso; // is GSO enabled for UDP
unsigned udp_gso; // is GSO enabled for UDP, max chunk_size
uint8_t *sbuf_udp_gso; // buffer to compose UDP GSO packet sequence
int udp_gso_zerocopy; // is UDP GSO Zero copy option enabled
int udp_gro; // will be used later
Expand Down
6 changes: 3 additions & 3 deletions prov/psm3/psm3/hal_sockets/sockets_hal.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,15 +175,15 @@ static void psm3_hfp_sockets_mq_init_defaults(struct psm2_mq *mq)
* corresponding PSM3_* env variables.
* Otherwise these defaults are used.
*/
mq->hfi_thresh_rv = 64000;
mq->hfi_base_window_rv = 131072;
mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH;
mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR;
// Even without RDMA do we want to disable rendezvous?
// even without RDMA, the receiver controlled pacing helps scalability
mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous
mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY;
#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
if (PSMI_IS_GPU_ENABLED)
mq->hfi_base_window_rv = 2097152;
mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR;
#endif
// we parse inet and rv_gpu_cache_size here so we can cache it
// once per EP open, even if multi-rail or multi-QP
Expand Down
Loading

0 comments on commit 241f82a

Please sign in to comment.