From 7cdbfbd4a2786672eecdf50c01e88a625e2fd8b7 Mon Sep 17 00:00:00 2001
From: Adam Goldman
Date: Tue, 5 Jul 2022 14:09:42 -0400
Subject: [PATCH] PSM3 OFI Provider from IEFS 11_3_0_0_130

Ensure 11.3 release matches SRPM found in IEFS release

Signed-off-by: Adam Goldman
---
 Makefile.in | 263 +---
 inc/ofi_cuda.h | 111 --
 libpsm3-fi.spec | 2 +-
 man/man7/fi_psm3.7 | 11 +-
 psm3/Makefile.include | 36 -
 psm3/hal_gen1/gen1_common.h | 64 -
 psm3/hal_gen1/gen1_gdrcpy.c | 236 ----
 psm3/hal_gen1/gen1_hal.c | 367 -----
 psm3/hal_gen1/gen1_hal.h | 620 ---------
 psm3/hal_gen1/gen1_hal_inline_i.h | 1653 -----------------------
 psm3/hal_gen1/gen1_hfi1_deprecated.h | 183 ---
 psm3/hal_gen1/gen1_i2cflash.c | 89 --
 psm3/hal_gen1/gen1_proto.c | 540 --------
 psm3/hal_gen1/gen1_ptl_ips.c | 1634 ----------------------
 psm3/hal_gen1/gen1_ptl_ips_expected.c | 89 --
 psm3/hal_gen1/gen1_ptl_ips_subcontext.h | 81 --
 psm3/hal_gen1/gen1_ptl_ips_writehdrq.h | 84 --
 psm3/hal_gen1/gen1_rcvthread.c | 193 ---
 psm3/hal_gen1/gen1_recvhdrq.c | 755 -----------
 psm3/hal_gen1/gen1_sdma.c | 893 ------------
 psm3/hal_gen1/gen1_sdma.h | 76 --
 psm3/hal_gen1/gen1_service.c | 972 -------------
 psm3/hal_gen1/gen1_service.h | 256 ----
 psm3/hal_gen1/gen1_spio.c | 998 --------------
 psm3/hal_gen1/gen1_spio.h | 155 ---
 psm3/hal_gen1/gen1_types.h | 244 ----
 psm3/hal_gen1/gen1_user.h | 672 ---------
 psm3/hal_gen1/gen1_utils.c | 401 ------
 psm3/hal_verbs/verbs_ep.c | 28 -
 psm3/hal_verbs/verbs_ep.h | 4 -
 psm3/hal_verbs/verbs_hal_inline_i.h | 6 -
 psm3/include/utils_debug.h | 24 -
 psm3/include/utils_sysfs.h | 12 -
 psm3/include/utils_user.h | 56 -
 psm3/psm.c | 10 -
 psm3/psm2_hal.c | 30 -
 psm3/psm2_hal.h | 241 ----
 psm3/psm2_hal_inline_t.h | 76 --
 psm3/psm_config.h | 8 -
 psm3/psm_context.c | 54 -
 psm3/psm_context.h | 26 -
 psm3/psm_ep.c | 47 -
 psm3/psm_ep.h | 28 -
 psm3/psm_ep_connect.c | 9 -
 psm3/psm_error.c | 4 -
 psm3/psm_mq.c | 14 -
 psm3/psm_mq_recv.c | 30 +-
 psm3/psm_netutils.h | 25 -
 psm3/psm_stats.c | 172 +--
 psm3/psm_stats.h | 5 -
 psm3/psm_user.h | 11 -
 psm3/psm_utils.c | 489 -------
 psm3/psm_utils.h | 30 -
 psm3/ptl_ips/ips_config.h | 27 -
 psm3/ptl_ips/ips_expected_proto.h | 150 --
 psm3/ptl_ips/ips_opp_path_rec.c | 40 -
 psm3/ptl_ips/ips_path_rec.c | 66 -
 psm3/ptl_ips/ips_path_rec.h | 22 -
 psm3/ptl_ips/ips_proto.c | 559 +------
 psm3/ptl_ips/ips_proto.h | 180 +--
 psm3/ptl_ips/ips_proto_am.c | 8 -
 psm3/ptl_ips/ips_proto_connect.c | 101 --
 psm3/ptl_ips/ips_proto_connect.h | 10 -
 psm3/ptl_ips/ips_proto_dump.c | 22 -
 psm3/ptl_ips/ips_proto_expected.c | 1344 +-----------------
 psm3/ptl_ips/ips_proto_header.h | 20 -
 psm3/ptl_ips/ips_proto_help.h | 134 --
 psm3/ptl_ips/ips_proto_internal.h | 6 -
 psm3/ptl_ips/ips_proto_mq.c | 194 +--
 psm3/ptl_ips/ips_proto_params.h | 26 -
 psm3/ptl_ips/ips_proto_recv.c | 329 -----
 psm3/ptl_ips/ips_recvhdrq.h | 42 -
 psm3/ptl_ips/ips_scb.c | 9 -
 psm3/ptl_ips/ips_scb.h | 33 -
 psm3/ptl_ips/ips_tid.c | 226 ----
 psm3/ptl_ips/ips_tid.h | 95 --
 psm3/ptl_ips/ips_tidcache.c | 632 ---------
 psm3/ptl_ips/ips_tidflow.c | 105 --
 psm3/ptl_ips/ips_tidflow.h | 11 -
 psm3/ptl_ips/ptl.c | 4 -
 psm3/utils/utils_dwordcpy-x86_64.c | 127 --
 psm3/utils/utils_sysfs.c | 174 ---
 shared/abi_1_0.c | 453 -------
 shared/fabric.c | 1406 -------------------
 shared/fi_tostr.c | 894 ------------
 shared/hmem_synapseai.c | 101 --
 shared/log.c | 193 ---
 shared/perf.c | 150 --
 shared/var.c | 337 -----
 89 files changed, 27 insertions(+), 21320 deletions(-)
 delete mode 100644 inc/ofi_cuda.h
 delete mode 100644 psm3/hal_gen1/gen1_common.h
 delete mode 100644
psm3/hal_gen1/gen1_gdrcpy.c delete mode 100644 psm3/hal_gen1/gen1_hal.c delete mode 100644 psm3/hal_gen1/gen1_hal.h delete mode 100644 psm3/hal_gen1/gen1_hal_inline_i.h delete mode 100644 psm3/hal_gen1/gen1_hfi1_deprecated.h delete mode 100644 psm3/hal_gen1/gen1_i2cflash.c delete mode 100644 psm3/hal_gen1/gen1_proto.c delete mode 100644 psm3/hal_gen1/gen1_ptl_ips.c delete mode 100644 psm3/hal_gen1/gen1_ptl_ips_expected.c delete mode 100644 psm3/hal_gen1/gen1_ptl_ips_subcontext.h delete mode 100644 psm3/hal_gen1/gen1_ptl_ips_writehdrq.h delete mode 100644 psm3/hal_gen1/gen1_rcvthread.c delete mode 100644 psm3/hal_gen1/gen1_recvhdrq.c delete mode 100644 psm3/hal_gen1/gen1_sdma.c delete mode 100644 psm3/hal_gen1/gen1_sdma.h delete mode 100644 psm3/hal_gen1/gen1_service.c delete mode 100644 psm3/hal_gen1/gen1_service.h delete mode 100644 psm3/hal_gen1/gen1_spio.c delete mode 100644 psm3/hal_gen1/gen1_spio.h delete mode 100644 psm3/hal_gen1/gen1_types.h delete mode 100644 psm3/hal_gen1/gen1_user.h delete mode 100644 psm3/hal_gen1/gen1_utils.c delete mode 100644 shared/abi_1_0.c delete mode 100644 shared/fabric.c delete mode 100644 shared/fi_tostr.c delete mode 100644 shared/hmem_synapseai.c delete mode 100644 shared/log.c delete mode 100644 shared/perf.c delete mode 100644 shared/var.c diff --git a/Makefile.in b/Makefile.in index 70b2e52..fe525e2 100644 --- a/Makefile.in +++ b/Makefile.in @@ -109,7 +109,6 @@ host_triplet = @host@ @HAVE_PSM3_SRC_TRUE@ psm3/libptl_am.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libptl_ips.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libptl_self.la \ -@HAVE_PSM3_SRC_TRUE@ psm3/libhal_gen1.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libhal_verbs.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libhal_sockets.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libpsm3i.la @@ -120,7 +119,6 @@ host_triplet = @host@ @HAVE_PSM3_SRC_TRUE@ $(psm3_libptl_ips_la_SOURCES) \ @HAVE_PSM3_SRC_TRUE@ $(psm3_libptl_self_la_SOURCES) \ @HAVE_PSM3_SRC_TRUE@ $(psm3_libutils_la_SOURCES) \ -@HAVE_PSM3_SRC_TRUE@ $(psm3_libhal_gen1_la_SOURCES) \ @HAVE_PSM3_SRC_TRUE@ $(psm3_libhal_verbs_la_SOURCES) \ @HAVE_PSM3_SRC_TRUE@ $(psm3_libhal_sockets_la_SOURCES) \ @HAVE_PSM3_SRC_TRUE@ $(psm3_libpsm3i_la_SOURCES) \ @@ -186,44 +184,6 @@ am__installdirs = "$(DESTDIR)$(libfabric_pkglibdir)" \ "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man7dir)" \ "$(DESTDIR)$(pkgconfigdir)" LTLIBRARIES = $(libfabric_pkglib_LTLIBRARIES) $(noinst_LTLIBRARIES) -psm3_libhal_gen1_la_LIBADD = -am__psm3_libhal_gen1_la_SOURCES_DIST = psm3/hal_gen1/gen1_types.h \ - psm3/hal_gen1/gen1_hfi1_deprecated.h \ - psm3/hal_gen1/gen1_common.h psm3/hal_gen1/gen1_i2cflash.c \ - psm3/hal_gen1/gen1_proto.c psm3/hal_gen1/gen1_sdma.c \ - psm3/hal_gen1/gen1_sdma.h psm3/hal_gen1/gen1_service.c \ - psm3/hal_gen1/gen1_service.h psm3/hal_gen1/gen1_user.h \ - psm3/hal_gen1/gen1_utils.c psm3/hal_gen1/gen1_gdrcpy.c \ - psm3/hal_gen1/gen1_hal.c psm3/hal_gen1/gen1_hal.h \ - psm3/hal_gen1/gen1_hal_inline_i.h \ - psm3/hal_gen1/gen1_ptl_ips_subcontext.h \ - psm3/hal_gen1/gen1_ptl_ips_writehdrq.h \ - psm3/hal_gen1/gen1_ptl_ips.c \ - psm3/hal_gen1/gen1_ptl_ips_expected.c \ - psm3/hal_gen1/gen1_rcvthread.c psm3/hal_gen1/gen1_recvhdrq.c \ - psm3/hal_gen1/gen1_spio.h -am__dirstamp = $(am__leading_dot)dirstamp -@HAVE_PSM3_SRC_TRUE@am_psm3_libhal_gen1_la_OBJECTS = psm3/hal_gen1/libhal_gen1_la-gen1_i2cflash.lo \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/libhal_gen1_la-gen1_proto.lo \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/libhal_gen1_la-gen1_sdma.lo \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/libhal_gen1_la-gen1_service.lo \ -@HAVE_PSM3_SRC_TRUE@ 
psm3/hal_gen1/libhal_gen1_la-gen1_utils.lo \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/libhal_gen1_la-gen1_gdrcpy.lo \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/libhal_gen1_la-gen1_hal.lo \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips.lo \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips_expected.lo \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/libhal_gen1_la-gen1_rcvthread.lo \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/libhal_gen1_la-gen1_recvhdrq.lo -psm3_libhal_gen1_la_OBJECTS = $(am_psm3_libhal_gen1_la_OBJECTS) -AM_V_lt = $(am__v_lt_@AM_V@) -am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) -am__v_lt_0 = --silent -am__v_lt_1 = -psm3_libhal_gen1_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ - $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ - $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ - $(LDFLAGS) -o $@ -@HAVE_PSM3_SRC_TRUE@am_psm3_libhal_gen1_la_rpath = psm3_libhal_sockets_la_LIBADD = am__psm3_libhal_sockets_la_SOURCES_DIST = \ psm3/hal_sockets/sockets_ep.c psm3/hal_sockets/sockets_ep.h \ @@ -237,6 +197,7 @@ am__psm3_libhal_sockets_la_SOURCES_DIST = \ psm3/hal_sockets/sockets_ptl_ips.c \ psm3/hal_sockets/sockets_rcvthread.c \ psm3/hal_sockets/sockets_recvhdrq.c +am__dirstamp = $(am__leading_dot)dirstamp @HAVE_PSM3_SRC_TRUE@am_psm3_libhal_sockets_la_OBJECTS = psm3/hal_sockets/libhal_sockets_la-sockets_ep.lo \ @HAVE_PSM3_SRC_TRUE@ psm3/hal_sockets/libhal_sockets_la-sockets_service.lo \ @HAVE_PSM3_SRC_TRUE@ psm3/hal_sockets/libhal_sockets_la-sockets_gdrcpy.lo \ @@ -246,6 +207,10 @@ am__psm3_libhal_sockets_la_SOURCES_DIST = \ @HAVE_PSM3_SRC_TRUE@ psm3/hal_sockets/libhal_sockets_la-sockets_rcvthread.lo \ @HAVE_PSM3_SRC_TRUE@ psm3/hal_sockets/libhal_sockets_la-sockets_recvhdrq.lo psm3_libhal_sockets_la_OBJECTS = $(am_psm3_libhal_sockets_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = psm3_libhal_sockets_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ $(psm3_libhal_sockets_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ @@ -579,17 +544,6 @@ am__depfiles_remade = psm3/$(DEPDIR)/libpsm3i_la-psm.Plo \ psm3/$(DEPDIR)/libpsm3i_la-psm_verbs_mr.Plo \ psm3/$(DEPDIR)/libpsm3i_la-psm_verbs_umrc.Plo \ psm3/$(DEPDIR)/libpsm3i_la-psmi_wrappers.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_gdrcpy.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_hal.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_i2cflash.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_proto.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips_expected.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_rcvthread.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_recvhdrq.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_sdma.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_service.Plo \ - psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_utils.Plo \ psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_ep.Plo \ psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_gdrcpy.Plo \ psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_hal.Plo \ @@ -716,16 +670,14 @@ AM_V_CCLD = $(am__v_CCLD_@AM_V@) am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) am__v_CCLD_0 = @echo " CCLD " $@; am__v_CCLD_1 = -SOURCES = $(psm3_libhal_gen1_la_SOURCES) \ - $(psm3_libhal_sockets_la_SOURCES) \ +SOURCES = $(psm3_libhal_sockets_la_SOURCES) \ $(psm3_libhal_verbs_la_SOURCES) $(psm3_libpsm3i_la_SOURCES) \ $(nodist_psm3_libpsm3i_la_SOURCES) \ $(psm3_libptl_am_la_SOURCES) 
$(psm3_libptl_ips_la_SOURCES) \ $(psm3_libptl_self_la_SOURCES) $(psm3_libutils_la_SOURCES) \ $(src_libpsm3_fi_la_SOURCES) \ $(nodist_src_libpsm3_fi_la_SOURCES) -DIST_SOURCES = $(am__psm3_libhal_gen1_la_SOURCES_DIST) \ - $(am__psm3_libhal_sockets_la_SOURCES_DIST) \ +DIST_SOURCES = $(am__psm3_libhal_sockets_la_SOURCES_DIST) \ $(am__psm3_libhal_verbs_la_SOURCES_DIST) \ $(am__psm3_libpsm3i_la_SOURCES_DIST) \ $(am__psm3_libptl_am_la_SOURCES_DIST) \ @@ -1175,41 +1127,6 @@ chksum_srcs = $(src_libpsm3_fi_la_SOURCES) $(am__append_11) \ @HAVE_PSM3_SRC_TRUE@psm3_libutils_la_CFLAGS = \ @HAVE_PSM3_SRC_TRUE@ $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) - -#ifdef PSM_OPA -@HAVE_PSM3_SRC_TRUE@psm3_libhal_gen1_la_SOURCES = \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_types.h \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_hfi1_deprecated.h \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_common.h \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_i2cflash.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_proto.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_sdma.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_sdma.h \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_service.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_service.h \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_user.h \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_utils.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_gdrcpy.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_hal.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_hal.h \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_hal_inline_i.h \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_ptl_ips_subcontext.h \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_ptl_ips_writehdrq.h \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_ptl_ips.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_ptl_ips_expected.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_rcvthread.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_recvhdrq.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_spio.h - -@HAVE_PSM3_SRC_TRUE@psm3_libhal_gen1_la_CPPFLAGS = \ -@HAVE_PSM3_SRC_TRUE@ -I$(top_srcdir)/psm3/hal_gen1/ \ -@HAVE_PSM3_SRC_TRUE@ $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) - -@HAVE_PSM3_SRC_TRUE@psm3_libhal_gen1_la_CFLAGS = \ -@HAVE_PSM3_SRC_TRUE@ $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) - - -#endif PSM_OPA @HAVE_PSM3_SRC_TRUE@psm3_libhal_verbs_la_SOURCES = \ @HAVE_PSM3_SRC_TRUE@ psm3/hal_verbs/verbs_ep.c \ @HAVE_PSM3_SRC_TRUE@ psm3/hal_verbs/verbs_ep.h \ @@ -1324,7 +1241,6 @@ chksum_srcs = $(src_libpsm3_fi_la_SOURCES) $(am__append_11) \ @HAVE_PSM3_SRC_TRUE@ psm3/libptl_am.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libptl_ips.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libptl_self.la \ -@HAVE_PSM3_SRC_TRUE@ psm3/libhal_gen1.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libhal_verbs.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libhal_sockets.la @@ -1333,13 +1249,11 @@ chksum_srcs = $(src_libpsm3_fi_la_SOURCES) $(am__append_11) \ @HAVE_PSM3_SRC_TRUE@ psm3/libptl_am.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libptl_ips.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libptl_self.la \ -@HAVE_PSM3_SRC_TRUE@ psm3/libhal_gen1.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libhal_verbs.la \ @HAVE_PSM3_SRC_TRUE@ psm3/libhal_sockets.la @HAVE_PSM3_SRC_TRUE@_psm3_extra_dist = \ @HAVE_PSM3_SRC_TRUE@ psm3/include/psm3_rbtree.c \ -@HAVE_PSM3_SRC_TRUE@ psm3/hal_gen1/gen1_spio.c \ @HAVE_PSM3_SRC_TRUE@ psm3/hal_verbs/verbs_spio.c \ @HAVE_PSM3_SRC_TRUE@ psm3/hal_sockets/sockets_spio.c \ @HAVE_PSM3_SRC_TRUE@ psm3/utils/utils_dwordcpy-x86_64-fast.S @@ -1464,51 +1378,6 @@ clean-noinstLTLIBRARIES: echo rm -f $${locs}; \ rm -f $${locs}; \ } -psm3/hal_gen1/$(am__dirstamp): - @$(MKDIR_P) psm3/hal_gen1 - @: > 
psm3/hal_gen1/$(am__dirstamp) -psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) psm3/hal_gen1/$(DEPDIR) - @: > psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_i2cflash.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_proto.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_sdma.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_service.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_utils.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_gdrcpy.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_hal.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips_expected.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_rcvthread.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/hal_gen1/libhal_gen1_la-gen1_recvhdrq.lo: \ - psm3/hal_gen1/$(am__dirstamp) \ - psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) -psm3/$(am__dirstamp): - @$(MKDIR_P) psm3 - @: > psm3/$(am__dirstamp) - -psm3/libhal_gen1.la: $(psm3_libhal_gen1_la_OBJECTS) $(psm3_libhal_gen1_la_DEPENDENCIES) $(EXTRA_psm3_libhal_gen1_la_DEPENDENCIES) psm3/$(am__dirstamp) - $(AM_V_CCLD)$(psm3_libhal_gen1_la_LINK) $(am_psm3_libhal_gen1_la_rpath) $(psm3_libhal_gen1_la_OBJECTS) $(psm3_libhal_gen1_la_LIBADD) $(LIBS) psm3/hal_sockets/$(am__dirstamp): @$(MKDIR_P) psm3/hal_sockets @: > psm3/hal_sockets/$(am__dirstamp) @@ -1539,6 +1408,9 @@ psm3/hal_sockets/libhal_sockets_la-sockets_rcvthread.lo: \ psm3/hal_sockets/libhal_sockets_la-sockets_recvhdrq.lo: \ psm3/hal_sockets/$(am__dirstamp) \ psm3/hal_sockets/$(DEPDIR)/$(am__dirstamp) +psm3/$(am__dirstamp): + @$(MKDIR_P) psm3 + @: > psm3/$(am__dirstamp) psm3/libhal_sockets.la: $(psm3_libhal_sockets_la_OBJECTS) $(psm3_libhal_sockets_la_DEPENDENCIES) $(EXTRA_psm3_libhal_sockets_la_DEPENDENCIES) psm3/$(am__dirstamp) $(AM_V_CCLD)$(psm3_libhal_sockets_la_LINK) $(am_psm3_libhal_sockets_la_rpath) $(psm3_libhal_sockets_la_OBJECTS) $(psm3_libhal_sockets_la_LIBADD) $(LIBS) @@ -1949,8 +1821,6 @@ mostlyclean-compile: -rm -f *.$(OBJEXT) -rm -f psm3/*.$(OBJEXT) -rm -f psm3/*.lo - -rm -f psm3/hal_gen1/*.$(OBJEXT) - -rm -f psm3/hal_gen1/*.lo -rm -f psm3/hal_sockets/*.$(OBJEXT) -rm -f psm3/hal_sockets/*.lo -rm -f psm3/hal_verbs/*.$(OBJEXT) @@ -2006,17 +1876,6 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@psm3/$(DEPDIR)/libpsm3i_la-psm_verbs_mr.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@psm3/$(DEPDIR)/libpsm3i_la-psm_verbs_umrc.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@psm3/$(DEPDIR)/libpsm3i_la-psmi_wrappers.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_gdrcpy.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_hal.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ 
@am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_i2cflash.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_proto.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips_expected.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_rcvthread.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_recvhdrq.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_sdma.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_service.Plo@am__quote@ # am--include-marker -@AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_utils.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_ep.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_gdrcpy.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_hal.Plo@am__quote@ # am--include-marker @@ -2155,83 +2014,6 @@ am--depfiles: $(am__depfiles_remade) @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< -psm3/hal_gen1/libhal_gen1_la-gen1_i2cflash.lo: psm3/hal_gen1/gen1_i2cflash.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_i2cflash.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_i2cflash.Tpo -c -o psm3/hal_gen1/libhal_gen1_la-gen1_i2cflash.lo `test -f 'psm3/hal_gen1/gen1_i2cflash.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_i2cflash.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_i2cflash.Tpo psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_i2cflash.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_i2cflash.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_i2cflash.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o psm3/hal_gen1/libhal_gen1_la-gen1_i2cflash.lo `test -f 'psm3/hal_gen1/gen1_i2cflash.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_i2cflash.c - -psm3/hal_gen1/libhal_gen1_la-gen1_proto.lo: psm3/hal_gen1/gen1_proto.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_proto.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_proto.Tpo 
-c -o psm3/hal_gen1/libhal_gen1_la-gen1_proto.lo `test -f 'psm3/hal_gen1/gen1_proto.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_proto.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_proto.Tpo psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_proto.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_proto.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_proto.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o psm3/hal_gen1/libhal_gen1_la-gen1_proto.lo `test -f 'psm3/hal_gen1/gen1_proto.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_proto.c - -psm3/hal_gen1/libhal_gen1_la-gen1_sdma.lo: psm3/hal_gen1/gen1_sdma.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_sdma.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_sdma.Tpo -c -o psm3/hal_gen1/libhal_gen1_la-gen1_sdma.lo `test -f 'psm3/hal_gen1/gen1_sdma.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_sdma.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_sdma.Tpo psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_sdma.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_sdma.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_sdma.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o psm3/hal_gen1/libhal_gen1_la-gen1_sdma.lo `test -f 'psm3/hal_gen1/gen1_sdma.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_sdma.c - -psm3/hal_gen1/libhal_gen1_la-gen1_service.lo: psm3/hal_gen1/gen1_service.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_service.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_service.Tpo -c -o psm3/hal_gen1/libhal_gen1_la-gen1_service.lo `test -f 'psm3/hal_gen1/gen1_service.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_service.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_service.Tpo psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_service.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_service.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_service.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o 
psm3/hal_gen1/libhal_gen1_la-gen1_service.lo `test -f 'psm3/hal_gen1/gen1_service.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_service.c - -psm3/hal_gen1/libhal_gen1_la-gen1_utils.lo: psm3/hal_gen1/gen1_utils.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_utils.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_utils.Tpo -c -o psm3/hal_gen1/libhal_gen1_la-gen1_utils.lo `test -f 'psm3/hal_gen1/gen1_utils.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_utils.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_utils.Tpo psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_utils.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_utils.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_utils.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o psm3/hal_gen1/libhal_gen1_la-gen1_utils.lo `test -f 'psm3/hal_gen1/gen1_utils.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_utils.c - -psm3/hal_gen1/libhal_gen1_la-gen1_gdrcpy.lo: psm3/hal_gen1/gen1_gdrcpy.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_gdrcpy.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_gdrcpy.Tpo -c -o psm3/hal_gen1/libhal_gen1_la-gen1_gdrcpy.lo `test -f 'psm3/hal_gen1/gen1_gdrcpy.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_gdrcpy.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_gdrcpy.Tpo psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_gdrcpy.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_gdrcpy.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_gdrcpy.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o psm3/hal_gen1/libhal_gen1_la-gen1_gdrcpy.lo `test -f 'psm3/hal_gen1/gen1_gdrcpy.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_gdrcpy.c - -psm3/hal_gen1/libhal_gen1_la-gen1_hal.lo: psm3/hal_gen1/gen1_hal.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_hal.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_hal.Tpo -c -o psm3/hal_gen1/libhal_gen1_la-gen1_hal.lo `test -f 'psm3/hal_gen1/gen1_hal.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_hal.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_hal.Tpo 
psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_hal.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_hal.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_hal.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o psm3/hal_gen1/libhal_gen1_la-gen1_hal.lo `test -f 'psm3/hal_gen1/gen1_hal.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_hal.c - -psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips.lo: psm3/hal_gen1/gen1_ptl_ips.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips.Tpo -c -o psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips.lo `test -f 'psm3/hal_gen1/gen1_ptl_ips.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_ptl_ips.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips.Tpo psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_ptl_ips.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips.lo `test -f 'psm3/hal_gen1/gen1_ptl_ips.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_ptl_ips.c - -psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips_expected.lo: psm3/hal_gen1/gen1_ptl_ips_expected.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips_expected.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips_expected.Tpo -c -o psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips_expected.lo `test -f 'psm3/hal_gen1/gen1_ptl_ips_expected.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_ptl_ips_expected.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips_expected.Tpo psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips_expected.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_ptl_ips_expected.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips_expected.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o psm3/hal_gen1/libhal_gen1_la-gen1_ptl_ips_expected.lo `test -f 'psm3/hal_gen1/gen1_ptl_ips_expected.c' || echo 
'$(srcdir)/'`psm3/hal_gen1/gen1_ptl_ips_expected.c - -psm3/hal_gen1/libhal_gen1_la-gen1_rcvthread.lo: psm3/hal_gen1/gen1_rcvthread.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_rcvthread.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_rcvthread.Tpo -c -o psm3/hal_gen1/libhal_gen1_la-gen1_rcvthread.lo `test -f 'psm3/hal_gen1/gen1_rcvthread.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_rcvthread.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_rcvthread.Tpo psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_rcvthread.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_rcvthread.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_rcvthread.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o psm3/hal_gen1/libhal_gen1_la-gen1_rcvthread.lo `test -f 'psm3/hal_gen1/gen1_rcvthread.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_rcvthread.c - -psm3/hal_gen1/libhal_gen1_la-gen1_recvhdrq.lo: psm3/hal_gen1/gen1_recvhdrq.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -MT psm3/hal_gen1/libhal_gen1_la-gen1_recvhdrq.lo -MD -MP -MF psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_recvhdrq.Tpo -c -o psm3/hal_gen1/libhal_gen1_la-gen1_recvhdrq.lo `test -f 'psm3/hal_gen1/gen1_recvhdrq.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_recvhdrq.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_recvhdrq.Tpo psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_recvhdrq.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='psm3/hal_gen1/gen1_recvhdrq.c' object='psm3/hal_gen1/libhal_gen1_la-gen1_recvhdrq.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_gen1_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_gen1_la_CFLAGS) $(CFLAGS) -c -o psm3/hal_gen1/libhal_gen1_la-gen1_recvhdrq.lo `test -f 'psm3/hal_gen1/gen1_recvhdrq.c' || echo '$(srcdir)/'`psm3/hal_gen1/gen1_recvhdrq.c - psm3/hal_sockets/libhal_sockets_la-sockets_ep.lo: psm3/hal_sockets/sockets_ep.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(psm3_libhal_sockets_la_CPPFLAGS) $(CPPFLAGS) $(psm3_libhal_sockets_la_CFLAGS) $(CFLAGS) -MT psm3/hal_sockets/libhal_sockets_la-sockets_ep.lo -MD -MP -MF psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_ep.Tpo -c -o psm3/hal_sockets/libhal_sockets_la-sockets_ep.lo `test -f 'psm3/hal_sockets/sockets_ep.c' || echo '$(srcdir)/'`psm3/hal_sockets/sockets_ep.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) 
psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_ep.Tpo psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_ep.Plo @@ -3162,7 +2944,6 @@ mostlyclean-libtool: clean-libtool: -rm -rf .libs _libs -rm -rf psm3/.libs psm3/_libs - -rm -rf psm3/hal_gen1/.libs psm3/hal_gen1/_libs -rm -rf psm3/hal_sockets/.libs psm3/hal_sockets/_libs -rm -rf psm3/hal_verbs/.libs psm3/hal_verbs/_libs -rm -rf psm3/ptl_am/.libs psm3/ptl_am/_libs @@ -3509,8 +3290,6 @@ distclean-generic: -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) -rm -f psm3/$(DEPDIR)/$(am__dirstamp) -rm -f psm3/$(am__dirstamp) - -rm -f psm3/hal_gen1/$(DEPDIR)/$(am__dirstamp) - -rm -f psm3/hal_gen1/$(am__dirstamp) -rm -f psm3/hal_sockets/$(DEPDIR)/$(am__dirstamp) -rm -f psm3/hal_sockets/$(am__dirstamp) -rm -f psm3/hal_verbs/$(DEPDIR)/$(am__dirstamp) @@ -3574,17 +3353,6 @@ distclean: distclean-am -rm -f psm3/$(DEPDIR)/libpsm3i_la-psm_verbs_mr.Plo -rm -f psm3/$(DEPDIR)/libpsm3i_la-psm_verbs_umrc.Plo -rm -f psm3/$(DEPDIR)/libpsm3i_la-psmi_wrappers.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_gdrcpy.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_hal.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_i2cflash.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_proto.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips_expected.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_rcvthread.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_recvhdrq.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_sdma.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_service.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_utils.Plo -rm -f psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_ep.Plo -rm -f psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_gdrcpy.Plo -rm -f psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_hal.Plo @@ -3765,17 +3533,6 @@ maintainer-clean: maintainer-clean-am -rm -f psm3/$(DEPDIR)/libpsm3i_la-psm_verbs_mr.Plo -rm -f psm3/$(DEPDIR)/libpsm3i_la-psm_verbs_umrc.Plo -rm -f psm3/$(DEPDIR)/libpsm3i_la-psmi_wrappers.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_gdrcpy.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_hal.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_i2cflash.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_proto.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_ptl_ips_expected.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_rcvthread.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_recvhdrq.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_sdma.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_service.Plo - -rm -f psm3/hal_gen1/$(DEPDIR)/libhal_gen1_la-gen1_utils.Plo -rm -f psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_ep.Plo -rm -f psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_gdrcpy.Plo -rm -f psm3/hal_sockets/$(DEPDIR)/libhal_sockets_la-sockets_hal.Plo diff --git a/inc/ofi_cuda.h b/inc/ofi_cuda.h deleted file mode 100644 index bba9b37..0000000 --- a/inc/ofi_cuda.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if HAVE_CONFIG_H -#include -#endif /* HAVE_CONFIG_H */ - -#ifndef _OFI_CUDA_H_ -#define _OFI_CUDA_H_ -#if HAVE_CUDA - -#include -#include - -static uint64_t -ofi_copy_cuda_iov_buf(const struct iovec *iov, size_t iov_count, - uint64_t iov_offset, void *buf, - uint64_t bufsize, int dir) -{ - uint64_t done = 0, len; - char *iov_buf; - size_t i; - - for (i = 0; i < iov_count && bufsize; i++) { - len = iov[i].iov_len; - - if (iov_offset > len) { - iov_offset -= len; - continue; - } - - iov_buf = (char *)iov[i].iov_base + iov_offset; - len -= iov_offset; - - len = MIN(len, bufsize); - if (dir == OFI_COPY_BUF_TO_IOV) - cudaMemcpy(iov_buf, (char *) buf + done, len, cudaMemcpyHostToDevice); - else if (dir == OFI_COPY_IOV_TO_BUF) - cudaMemcpy((char *) buf + done, iov_buf, len, cudaMemcpyDeviceToHost); - - iov_offset = 0; - bufsize -= len; - done += len; - } - return done; -} - -static inline uint64_t -ofi_copy_from_cuda_iov(void *buf, uint64_t bufsize, - const struct iovec *iov, size_t iov_count, uint64_t iov_offset) -{ - if (iov_count == 1) { - uint64_t size = ((iov_offset > iov[0].iov_len) ? - 0 : MIN(bufsize, iov[0].iov_len - iov_offset)); - - cudaMemcpy(buf, (char *) iov[0].iov_base + iov_offset, - size, cudaMemcpyDeviceToHost); - return size; - } else { - return ofi_copy_cuda_iov_buf(iov, iov_count, iov_offset, buf, - bufsize, OFI_COPY_IOV_TO_BUF); - } -} - -static inline uint64_t -ofi_copy_to_cuda_iov(const struct iovec *iov, size_t iov_count, uint64_t iov_offset, - void *buf, uint64_t bufsize) -{ - if (iov_count == 1) { - uint64_t size = ((iov_offset > iov[0].iov_len) ? 
- 0 : MIN(bufsize, iov[0].iov_len - iov_offset)); - cudaMemcpy((char *) iov[0].iov_base + iov_offset, - buf, size, cudaMemcpyHostToDevice); - return size; - } else { - return ofi_copy_cuda_iov_buf(iov, iov_count, iov_offset, buf, - bufsize, OFI_COPY_BUF_TO_IOV); - } -} - -#endif /* HAVE_CUDA */ -#endif /* _OFI_CUDA_H_ */ diff --git a/libpsm3-fi.spec b/libpsm3-fi.spec index 7e26e18..7750840 100644 --- a/libpsm3-fi.spec +++ b/libpsm3-fi.spec @@ -4,7 +4,7 @@ Name: lib%{provider}-fi Version: 11.3.0.0 -Release: 999 +Release: 130 Summary: Dynamic %{provider_formal} provider for Libfabric Group: System Environment/Libraries diff --git a/man/man7/fi_psm3.7 b/man/man7/fi_psm3.7 index 88b988a..8208a12 100644 --- a/man/man7/fi_psm3.7 +++ b/man/man7/fi_psm3.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.5 .\" -.TH "fi_psm3" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "Libfabric v11.2.0.0" +.TH "fi_psm3" "7" "2022\-03\-30" "Libfabric Programmer\[cq]s Manual" "Libfabric v11.3.0.0" .hy .SH NAME .PP @@ -8,9 +8,12 @@ fi_psm3 \- The PSM3 Fabric Provider .SH OVERVIEW .PP The \f[I]psm3\f[R] provider implements a Performance Scaled Messaging -capability which supports Intel RoCEv2 capable NICs. -PSM3 represents an Ethernet and standard RoCEv2 enhancement of previous -PSM implementations. +capability which supports most verbs UD and sockets devices. +Additional features and optimizations can be enabled when running over +Intel\[cq]s E810 Ethernet NICs and/or using Intel\[cq]s rendezvous +kernel module (\f[C]rv\f[R]). +PSM 3.x fully integrates the OFI provider and the underlying PSM3 +protocols/implementation and only exports the OFI APIs. .SH SUPPORTED FEATURES .PP The \f[I]psm3\f[R] provider supports a subset of all the features diff --git a/psm3/Makefile.include b/psm3/Makefile.include index 15d621b..c0266d4 100644 --- a/psm3/Makefile.include +++ b/psm3/Makefile.include @@ -15,7 +15,6 @@ noinst_LTLIBRARIES += \ psm3/libptl_am.la \ psm3/libptl_ips.la \ psm3/libptl_self.la \ - psm3/libhal_gen1.la \ psm3/libhal_verbs.la \ psm3/libhal_sockets.la \ psm3/libpsm3i.la @@ -116,37 +115,6 @@ psm3_libutils_la_CPPFLAGS = \ psm3_libutils_la_CFLAGS = \ $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) -#ifdef PSM_OPA -psm3_libhal_gen1_la_SOURCES = \ - psm3/hal_gen1/gen1_types.h \ - psm3/hal_gen1/gen1_hfi1_deprecated.h \ - psm3/hal_gen1/gen1_common.h \ - psm3/hal_gen1/gen1_i2cflash.c \ - psm3/hal_gen1/gen1_proto.c \ - psm3/hal_gen1/gen1_sdma.c \ - psm3/hal_gen1/gen1_sdma.h \ - psm3/hal_gen1/gen1_service.c \ - psm3/hal_gen1/gen1_service.h \ - psm3/hal_gen1/gen1_user.h \ - psm3/hal_gen1/gen1_utils.c \ - psm3/hal_gen1/gen1_gdrcpy.c \ - psm3/hal_gen1/gen1_hal.c \ - psm3/hal_gen1/gen1_hal.h \ - psm3/hal_gen1/gen1_hal_inline_i.h \ - psm3/hal_gen1/gen1_ptl_ips_subcontext.h \ - psm3/hal_gen1/gen1_ptl_ips_writehdrq.h \ - psm3/hal_gen1/gen1_ptl_ips.c \ - psm3/hal_gen1/gen1_ptl_ips_expected.c \ - psm3/hal_gen1/gen1_rcvthread.c \ - psm3/hal_gen1/gen1_recvhdrq.c \ - psm3/hal_gen1/gen1_spio.h -psm3_libhal_gen1_la_CPPFLAGS = \ - -I$(top_srcdir)/psm3/hal_gen1/ \ - $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) -psm3_libhal_gen1_la_CFLAGS = \ - $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) - -#endif PSM_OPA psm3_libhal_verbs_la_SOURCES = \ psm3/hal_verbs/verbs_ep.c \ psm3/hal_verbs/verbs_ep.h \ @@ -255,7 +223,6 @@ psm3_libpsm3i_la_LIBADD = \ psm3/libptl_am.la \ psm3/libptl_ips.la \ psm3/libptl_self.la \ - psm3/libhal_gen1.la \ psm3/libhal_verbs.la \ psm3/libhal_sockets.la @@ -264,13 +231,11 @@ psm3_libpsm3i_la_DEPENDENCIES = 
\ psm3/libptl_am.la \ psm3/libptl_ips.la \ psm3/libptl_self.la \ - psm3/libhal_gen1.la \ psm3/libhal_verbs.la \ psm3/libhal_sockets.la _psm3_extra_dist = \ psm3/include/psm3_rbtree.c \ - psm3/hal_gen1/gen1_spio.c \ psm3/hal_verbs/verbs_spio.c \ psm3/hal_sockets/sockets_spio.c \ psm3/utils/utils_dwordcpy-x86_64-fast.S @@ -281,7 +246,6 @@ chksum_srcs += \ $(psm3_libptl_ips_la_SOURCES) \ $(psm3_libptl_self_la_SOURCES) \ $(psm3_libutils_la_SOURCES) \ - $(psm3_libhal_gen1_la_SOURCES) \ $(psm3_libhal_verbs_la_SOURCES) \ $(psm3_libhal_sockets_la_SOURCES) \ $(psm3_libpsm3i_la_SOURCES) \ diff --git a/psm3/hal_gen1/gen1_common.h b/psm3/hal_gen1/gen1_common.h deleted file mode 100644 index ad66e94..0000000 --- a/psm3/hal_gen1/gen1_common.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ - -#ifndef PSM_HAL_GEN1_COMMON_H -#define PSM_HAL_GEN1_COMMON_H - -#include -#include "gen1_hfi1_deprecated.h" - -#endif /* PSM_HAL_GEN1_COMMON_H */ -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_gdrcpy.c b/psm3/hal_gen1/gen1_gdrcpy.c deleted file mode 100644 index 6090895..0000000 --- a/psm3/hal_gen1/gen1_gdrcpy.c +++ /dev/null @@ -1,236 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. 
When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2018 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2018 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ -#ifdef PSM_CUDA -#include "psm_user.h" -#include "psm2_hal.h" -#include -#include -#include -#include "ips_proto.h" -#include "ptl_ips/ips_tid.h" -#include "ptl_ips/ips_expected_proto.h" -#include "gen1_user.h" -#include "ptl_ips.h" -#include "gen1_hal.h" - -static int gdr_fd; - -// Note: ep->epaddr->proto is always NULL. 
ep->epaddr only has epid -// so we must navigate the ep->ptl_ips to get ips_proto -static inline -struct ips_proto *psm3_gen1_get_proto(psm2_ep_t ep) -{ - struct ips_proto *proto = &((struct ptl_ips*)(ep->ptl_ips.ptl))->proto; - psmi_assert(ep == proto->ep); - return proto; -} - -uint64_t -psm3_gen1_gdr_cache_evict() { - int ret; - struct hfi1_gdr_cache_evict_params params; - params.evict_params_in.version = HFI1_GDR_VERSION; - params.evict_params_in.pages_to_evict = 4; - - ret = ioctl(gdr_fd, HFI1_IOCTL_GDR_GPU_CACHE_EVICT, ¶ms); - if (ret) { - /* Fatal error */ - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "PIN/MMAP ioctl failed ret %d errno %d\n", - ret, errno); - return ret; - } - - return params.evict_params_out.pages_evicted; -} - - -static uint64_t -psm3_gen1_sdma_gpu_cache_evict(int fd) { - int ret; - struct hfi1_sdma_gpu_cache_evict_params params; - params.evict_params_in.version = HFI1_GDR_VERSION; - params.evict_params_in.pages_to_evict = 2; - - ret = ioctl(fd, HFI1_IOCTL_SDMA_CACHE_EVICT, ¶ms); - if (ret) { - /* Fatal error */ - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "SDMA Cache Evict failed ret %d errno %d\n", - ret, errno); - return ret; - } - - return params.evict_params_out.pages_evicted; -} - -/* handle_out_of_bar_space is called when the driver tries - * to self evict in the GDR cache and finds no entries. - * This could be due to the fact that all the pages pinned - * in the BAR1 region are cached in the SDMA and TID cache. - * We try to evict from both the caches for 30 seconds after - * which we bail out. If successful we retry to PIN/MMAP once - * again - */ -static uint64_t -handle_out_of_bar_space(struct ips_proto *proto) -{ - time_t lastEvictTime = 0; - uint64_t lengthEvicted; - time_t now; - retry: - now = time(NULL); - - if (!lastEvictTime) - lastEvictTime = now; - - if (proto->protoexp && proto->protoexp->tidc.tid_cachemap.payload.nidle) { - lengthEvicted = - ips_tidcache_evict(&proto->protoexp->tidc, -1); - - if (lengthEvicted) { - lastEvictTime = 0; - return lengthEvicted; /* signals a retry of the writev command. */ - } - } - - lengthEvicted = psm3_gen1_sdma_gpu_cache_evict(psm3_gen1_get_fd(proto->ep->context.psm_hw_ctxt)); - if (lengthEvicted) { - lastEvictTime = 0; - return lengthEvicted; - } - static const double thirtySeconds = 30.0; - if (difftime(now, lastEvictTime) > - thirtySeconds) { - return 0; - } else { - goto retry; - } -} - -// flags=0 for send, 1 for recv -void * -psm3_gen1_gdr_convert_gpu_to_host_addr(unsigned long buf, - size_t size, int flags, - psm2_ep_t ep) -{ - struct hfi1_gdr_query_params query_params; - int ret; - void *host_addr_buf; - - uintptr_t pageaddr = buf & GPU_PAGE_MASK; - /* As size is guarenteed to be in the range of 0-8kB - * there is a guarentee that buf+size-1 does not overflow - * 64 bits. 
- */ - uint32_t pagelen = (uint32_t) (PSMI_GPU_PAGESIZE + - ((buf + size - 1) & GPU_PAGE_MASK) - - pageaddr); - - psmi_assert(NULL != psm3_gen1_get_proto(ep)); - _HFI_VDBG("buf=%p size=%zu pageaddr=%p pagelen=%u flags=0x%x ep=%p\n", - (void *)buf, size, (void *)pageaddr, pagelen, flags, ep); - query_params.query_params_in.version = HFI1_GDR_VERSION; - query_params.query_params_in.gpu_buf_addr = pageaddr; - query_params.query_params_in.gpu_buf_size = pagelen; - retry: - - ret = ioctl(gdr_fd, HFI1_IOCTL_GDR_GPU_PIN_MMAP, &query_params); - - if (ret) { - if (errno == ENOMEM || errno == EINVAL) { - if (!handle_out_of_bar_space(psm3_gen1_get_proto(ep))) { - /* Fatal error */ - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Unable to PIN GPU pages(Out of BAR1 space) (errno: %d)\n", errno); - return NULL; - } else { - goto retry; - } - } else { - /* Fatal error */ - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "PIN/MMAP ioctl failed ret %d errno %d\n", - ret, errno); - return NULL; - } - } - host_addr_buf = (void *)query_params.query_params_out.host_buf_addr; - return host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK); -} - -void psm3_hfp_gen1_gdr_open() -{ - gdr_fd = open(GDR_DEVICE_PATH, O_RDWR); - if (-1 == gdr_fd ) { - /* Non-Fatal error. If device cannot be found we assume - * that the driver does not support GDR Copy and we fallback - * to sending all GPU messages using rndv protocol - */ - _HFI_INFO(" Warning: The HFI1 driver installed does not support GPUDirect RDMA" - " fast copy. Turning off GDR fast copy in PSM \n"); - is_gdr_copy_enabled = gdr_copy_limit_send = - gdr_copy_limit_recv = 0; - return; - } - return; -} - -void psm3_gen1_gdr_close() -{ - close(gdr_fd); -} - -#endif /* PSM_CUDA */ -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_hal.c b/psm3/hal_gen1/gen1_hal.c deleted file mode 100644 index c54319d..0000000 --- a/psm3/hal_gen1/gen1_hal.c +++ /dev/null @@ -1,367 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2017 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2017 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "psm_user.h" -#include "psm2_hal.h" -#include "gen1_user.h" - -#if PSMI_HAL_INST_CNT > 1 || defined(PSM_DEBUG) -// declare all the HAL_INLINE functions and pull in implementation as non-inline -#define PSMI_HAL_CAT_INL_SYM(KERNEL) psm3_hfp_gen1_ ## KERNEL -#include "psm2_hal_inline_t.h" -#include "gen1_hal_inline_i.h" -#endif - -static int psm3_hfp_gen1_initialize(psmi_hal_instance_t *phi, - int devid_enabled[PTL_MAX_INIT]) -{ - /* psm3_hal_current_hal_instance is not yet initialized, so - * we can't call psmi_hal_* routines to set cap or sw_status - */ - - /* we initialize a few HAL software specific capabilities which - * are known before context_open can open RV or parse HAL specific - * env variables. Additional flags may be added to cap_mask by - * context_open. - * Any flags which influence PSM env variable parsing prior to - * context_open must be set here - */ - phi->params.cap_mask = 0; - -#if 0 - // this may have been an OPA bug, but may be hiding other bugs - // This was guarded by a test of PSM_HAL_CAP_HDRSUPP, however that cap_mask - // is not set until context_open so this code was never run and - // the PSM_HAL_HDRSUPP_ENABLED sw_status was never set. Error handling code - // for packet sequence errors uses if_pf testing PSM_HAL_HDRSUPP_ENABLED - { - union psmi_envvar_val env_hdrsupp; - - psm3_getenv("PSM3_HDRSUPP", - "Receive header suppression. 
Default is 1 (enabled)," - " 0 to disable.\n", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, - (union psmi_envvar_val)1, &env_hdrsupp); - if (env_hdrsupp.e_uint) - phi->params.sw_status |= PSM_HAL_HDRSUPP_ENABLED; - } -#endif - - return 0; -} - -/* functions called vis DISPATCH_FUNC */ -static int psm3_hfp_gen1_finalize_(void) -{ - return 0; -} - -static const char* psm3_hfp_gen1_identify(void) -{ - static char buf[100]; - -/* we test NVIDIA_GPU_DIRECT here instead of PSM_CUDA since that define - * controls the hfi1 header file interface - */ - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against driver interface v%d.%d" -#ifdef NVIDIA_GPU_DIRECT - " gpu cuda" -#endif - , - psmi_hal_get_hal_instance_name(), - psmi_hal_get_hal_instance_description(), - HFI1_USER_SWMAJOR, HFI1_USER_SWMINOR); - return buf; -} - -// used as domain.name for fi_info -static const char *psm3_hfp_gen1_get_unit_name(int unit) -{ - return psm3_sysfs_unit_dev_name(unit); -} - -// used as fabric.name for fi_info -static int psm3_hfp_gen1_get_port_subnet_name(int unit, int port, int addr_index, char *buf, size_t bufsize) -{ - psmi_subnet128_t subnet; - - if (psm3_hfp_gen1_get_port_subnet(unit, 1, addr_index, &subnet, NULL, NULL, NULL)) - return -1; - - psm3_subnet128_fmt_name(subnet, buf, bufsize); - return 0; -} - -static int psm3_hfp_gen1_get_port_lid(int unit, int port, int addr_index) -{ - return psm3_gen1_get_port_lid(unit, port, addr_index, GEN1_FILTER); -} - -// initialize default MQ thresholds -// This is called prior to parsing PSM3_ env variables for MQ and -// also prior to the EP being opened (eg. NIC not yet selected). -static void psm3_hfp_gen1_mq_init_defaults(struct psm2_mq *mq) -{ - unsigned rdmamode = psm3_gen1_parse_tid(1); - - /* These values may be changed by initialize_params if user specifies - * corresponding PSM3_* env variables. - * Otherwise these defaults are used. - */ - if(psm3_cpu_model == CPUID_MODEL_PHI_GEN2 || psm3_cpu_model == CPUID_MODEL_PHI_GEN2M) - { - mq->hfi_thresh_rv = 200000; - mq->hfi_base_window_rv = 4194304; - } else { - mq->hfi_thresh_rv = 64000; - mq->hfi_base_window_rv = 131072; - } - // hfi_base_window_rv may be further reduced in protoexp_init to account - // for max TID resources allowed per IO - - // reload env var cache once per MQ so don't report in VERBOSE_ENV per rail - if (! (rdmamode & IPS_PROTOEXP_FLAG_ENABLED)) { - // Retain existing gen1 behavior and leave rendezvous enabled. It - // will use LONG_DATA mechanism which provides receive side pacing - //mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous - } - mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) - mq->hfi_base_window_rv = 2097152; -#endif -} - -// initialize default EP Open options -// This is called in psm3_ep_open_internal prior to parsing PSM3_ env variables -// and also prior to the EP being opened (eg. NIC not yet selected). 
-static void psm3_hfp_gen1_ep_open_opts_get_defaults(struct psm3_ep_open_opts *opts) -{ - opts->imm_size = 128; -} - -static void psm3_hfp_gen1_context_initstats(psm2_ep_t ep) -{ - // Noop -} - -/* functions called vis DISPATCH_PI */ -static int psm3_hfp_gen1_get_num_ports(void) -{ - return HFI_NUM_PORTS_GEN1; -} - -static int psm3_hfp_gen1_get_unit_active(int unit) -{ - return psm3_gen1_get_unit_active(unit, GEN1_FILTER); -} - -static int psm3_hfp_gen1_get_num_free_contexts(int unit) -{ - int64_t nfreectxts=0; - - if (!psm3_sysfs_unit_read_s64(unit, "nfreectxts", - &nfreectxts, 0)) - { - return (int)nfreectxts; - } - return -PSM_HAL_ERROR_GENERAL_ERROR; -} - -static int psm3_hfp_gen1_get_default_pkey(void) -{ - return 0x8001; /* fabric default pkey for app traffic */ -} - -static int psm3_hfp_gen1_get_unit_pci_bus(int unit, uint32_t *domain, - uint32_t *bus, uint32_t *device, uint32_t *function) -{ - return psm3_sysfs_get_unit_pci_bus(unit, domain, bus, device, function); -} - -static int psm3_hfp_gen1_get_unit_device_id(int unit, char *buf, size_t bufsize) -{ - return psm3_sysfs_get_unit_device_id(unit, buf, bufsize); -} - -static int psm3_hfp_gen1_get_unit_device_version(int unit, char *buf, size_t bufsize) -{ - return psm3_sysfs_get_unit_device_version(unit, buf, bufsize); -} - -static int psm3_hfp_gen1_get_unit_vendor_id(int unit, char *buf, size_t bufsize) -{ - return psm3_sysfs_get_unit_vendor_id(unit, buf, bufsize); -} - -static int psm3_hfp_gen1_get_unit_driver(int unit, char *buf, size_t bufsize) -{ - return psm3_sysfs_get_unit_driver(unit, buf, bufsize); -} - -/* define the singleton that implements hal for gen1 */ -static hfp_gen1_t psm3_gen1_hi = { - /* start of public psmi_hal_instance_t data */ - .phi = { - .hal_index = PSM_HAL_INDEX_OPA, - .description = "OPA100" -#ifdef PSM_CUDA - " (cuda)" -#endif - , - .nic_sys_class_path = "/sys/class/infiniband", - .nic_sys_port_path_fmt = PSM3_PORT_PATH_TYPE_IB, - .params = {0}, - - /* functions called directly, no DISPATCH macro */ - .hfp_initialize = psm3_hfp_gen1_initialize, - .hfp_have_active_unit = psm3_hfp_gen1_have_active_unit, - - /* called via DISPATCH_FUNC */ - .hfp_finalize_ = psm3_hfp_gen1_finalize_, - .hfp_identify = psm3_hfp_gen1_identify, - .hfp_get_unit_name = psm3_hfp_gen1_get_unit_name, - .hfp_get_port_subnet_name = psm3_hfp_gen1_get_port_subnet_name, - .hfp_get_port_speed = psm3_hfp_gen1_get_port_speed, - .hfp_get_port_lid = psm3_hfp_gen1_get_port_lid, - .hfp_mq_init_defaults = psm3_hfp_gen1_mq_init_defaults, - .hfp_ep_open_opts_get_defaults = psm3_hfp_gen1_ep_open_opts_get_defaults, - .hfp_context_initstats = psm3_hfp_gen1_context_initstats, -#ifdef PSM_CUDA - .hfp_gdr_open = psm3_hfp_gen1_gdr_open, -#endif - - /* called via DISPATCH_PI */ - .hfp_get_num_units = psm3_hfp_gen1_get_num_units, - .hfp_get_num_ports = psm3_hfp_gen1_get_num_ports, - .hfp_get_unit_active = psm3_hfp_gen1_get_unit_active, - .hfp_get_port_active = psm3_hfp_gen1_get_port_active, - .hfp_get_num_contexts = psm3_hfp_gen1_get_num_contexts, - .hfp_get_num_free_contexts = psm3_hfp_gen1_get_num_free_contexts, - .hfp_get_default_pkey = psm3_hfp_gen1_get_default_pkey, - .hfp_get_port_subnet = psm3_hfp_gen1_get_port_subnet, - .hfp_get_unit_pci_bus = psm3_hfp_gen1_get_unit_pci_bus, - .hfp_get_unit_device_id = psm3_hfp_gen1_get_unit_device_id, - .hfp_get_unit_device_version = psm3_hfp_gen1_get_unit_device_version, - .hfp_get_unit_vendor_id = psm3_hfp_gen1_get_unit_vendor_id, - .hfp_get_unit_driver = psm3_hfp_gen1_get_unit_driver, - - /* called via 
DISPATCH, may be inline */ -#if PSMI_HAL_INST_CNT > 1 || defined(PSM_DEBUG) - .hfp_context_open = psm3_hfp_gen1_context_open, - .hfp_close_context = psm3_hfp_gen1_close_context, - .hfp_context_check_status = psm3_hfp_gen1_context_check_status, -#ifdef PSM_FI - .hfp_faultinj_allowed = psm3_hfp_gen1_faultinj_allowed, -#endif - .hfp_ips_ptl_init_pre_proto_init = psm3_hfp_gen1_ips_ptl_init_pre_proto_init, - .hfp_ips_ptl_init_post_proto_init = psm3_hfp_gen1_ips_ptl_init_post_proto_init, - .hfp_ips_ptl_fini = psm3_hfp_gen1_ips_ptl_fini, - .hfp_ips_proto_init = psm3_hfp_gen1_ips_proto_init, - .hfp_ips_proto_update_linkinfo = psm3_hfp_gen1_ips_proto_update_linkinfo, - .hfp_ips_fully_connected = psm3_hfp_gen1_ips_fully_connected, - .hfp_ips_ipsaddr_set_req_params = psm3_hfp_gen1_ips_ipsaddr_set_req_params, - .hfp_ips_ipsaddr_process_connect_reply = psm3_hfp_gen1_ips_ipsaddr_process_connect_reply, - .hfp_ips_proto_build_connect_message = psm3_hfp_gen1_ips_proto_build_connect_message, - .hfp_ips_ipsaddr_init_addressing = psm3_hfp_gen1_ips_ipsaddr_init_addressing, - .hfp_ips_ipsaddr_init_connections = psm3_hfp_gen1_ips_ipsaddr_init_connections, - .hfp_ips_ipsaddr_free = psm3_hfp_gen1_ips_ipsaddr_free, - .hfp_ips_flow_init = psm3_hfp_gen1_ips_flow_init, - .hfp_ips_ipsaddr_disconnect = psm3_hfp_gen1_ips_ipsaddr_disconnect, - .hfp_ips_ibta_init = psm3_hfp_gen1_ips_ibta_init, - .hfp_ips_path_rec_init = psm3_hfp_gen1_ips_path_rec_init, - .hfp_ips_ptl_pollintr = psm3_hfp_gen1_ips_ptl_pollintr, -#ifdef PSM_CUDA - .hfp_gdr_close = psm3_hfp_gen1_gdr_close, - .hfp_gdr_convert_gpu_to_host_addr = psm3_hfp_gen1_gdr_convert_gpu_to_host_addr, -#endif /* PSM_CUDA */ - .hfp_get_port_index2pkey = psm3_hfp_gen1_get_port_index2pkey, - .hfp_poll_type = psm3_hfp_gen1_poll_type, - .hfp_free_tid = psm3_hfp_gen1_free_tid, - .hfp_get_tidcache_invalidation = psm3_hfp_gen1_get_tidcache_invalidation, - .hfp_update_tid = psm3_hfp_gen1_update_tid, - .hfp_tidflow_check_update_pkt_seq = psm3_hfp_gen1_tidflow_check_update_pkt_seq, - .hfp_tidflow_get = psm3_hfp_gen1_tidflow_get, - .hfp_tidflow_get_hw = psm3_hfp_gen1_tidflow_get_hw, - .hfp_tidflow_get_seqnum = psm3_hfp_gen1_tidflow_get_seqnum, - .hfp_tidflow_reset = psm3_hfp_gen1_tidflow_reset, - .hfp_tidflow_set_entry = psm3_hfp_gen1_tidflow_set_entry, - .hfp_get_hfi_event_bits = psm3_hfp_gen1_get_hfi_event_bits, - .hfp_spio_transfer_frame = psm3_hfp_gen1_spio_transfer_frame, - .hfp_transfer_frame = psm3_hfp_gen1_transfer_frame, - .hfp_dma_send_pending_scbs = psm3_hfp_gen1_dma_send_pending_scbs, - .hfp_drain_sdma_completions = psm3_hfp_gen1_drain_sdma_completions, - .hfp_get_node_id = psm3_hfp_gen1_get_node_id, - .hfp_get_jkey = psm3_hfp_gen1_get_jkey, - .hfp_get_pio_size = psm3_hfp_gen1_get_pio_size, - .hfp_get_pio_stall_cnt = psm3_hfp_gen1_get_pio_stall_cnt, - .hfp_get_subctxt = psm3_hfp_gen1_get_subctxt, - .hfp_get_subctxt_cnt = psm3_hfp_gen1_get_subctxt_cnt, - .hfp_get_tid_exp_cnt = psm3_hfp_gen1_get_tid_exp_cnt, - .hfp_set_pkey = psm3_hfp_gen1_set_pkey, -#endif /* PSMI_HAL_INST_CNT > 1 || defined(PSM_DEBUG) */ - }, - /* start of private hfp_gen1_private data */ - .hfp_private = { - .sdmahdr_req_size = 0, - .dma_rtail = 0, - .hdrq_rhf_off = 0, - } -}; - -static void __attribute__ ((constructor)) __psmi_hal_gen1_constructor(void) -{ - psm3_hal_register_instance((psmi_hal_instance_t*)&psm3_gen1_hi); -} -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_hal.h b/psm3/hal_gen1/gen1_hal.h deleted file mode 100644 index 590efc7..0000000 --- a/psm3/hal_gen1/gen1_hal.h +++ /dev/null @@ 
-1,620 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2017 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2017 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef _PSM_HAL_GEN1_HAL_H -#define _PSM_HAL_GEN1_HAL_H - -#include "psm_user.h" -#include "ips_proto.h" -#include "ips_proto_internal.h" -#include "gen1_spio.h" -#include "gen1_sdma.h" -#include "psm_mq_internal.h" -#include "gen1_user.h" -#include "gen1_ptl_ips_subcontext.h" - -COMPILE_TIME_ASSERT(MAX_SHARED_CTXTS_MUST_MATCH, PSM_HAL_MAX_SHARED_CTXTS == HFI1_MAX_SHARED_CTXTS); - -/* Private struct on a per-context basis. 
*/ -typedef struct _hfp_gen1_pc_private -{ - struct _hfi_ctrl *ctrl; /* driver opaque hfi_proto */ - psm3_gen1_cl_q_t cl_qs[PSM3_GEN1_GET_SC_CL_Q_RX_EGR_Q(7) + 1]; - struct gen1_ips_hwcontext_ctrl *hwcontext_ctrl; - struct gen1_ips_subcontext_ureg *subcontext_ureg[HFI1_MAX_SHARED_CTXTS]; - struct psm3_gen1_spio spio_ctrl; - struct hfi1_user_info_dep user_info; - uint16_t sc2vl[PSMI_N_SCS]; -} hfp_gen1_pc_private; - -/* declare the hfp_gen1_private struct */ -typedef struct _hfp_gen1_private -{ - /* GEN1 specific data that are common to all contexts: */ - int sdmahdr_req_size; - int dma_rtail; - uint32_t hdrq_rhf_off; -} hfp_gen1_private_t; - -/* declare hfp_gen1_t struct, (combines public psmi_hal_instance_t - together with a private struct) */ -typedef struct _hfp_gen1 -{ - psmi_hal_instance_t phi; - hfp_gen1_private_t hfp_private; -} hfp_gen1_t; - -static inline struct _hfp_gen1 *get_psm_gen1_hi(void) -{ - return (struct _hfp_gen1*) psm3_hal_current_hal_instance; -} - -const char* psm3_gen1_identify(void); - -static inline -uint32_t -psm3_gen1_get_ht(volatile uint64_t *ht_register) -{ - uint64_t res = *ht_register; - ips_rmb(); - return (uint32_t)res; -} - -void psm3_gen1_ips_ptl_dump_err_stats(struct ips_proto *proto); - -static inline -void -psm3_gen1_set_ht(volatile uint64_t *ht_register, uint64_t new_ht) -{ - *ht_register = new_ht; - return; -} - -/* Getter for cl q head indexes: */ -static inline psm3_gen1_cl_idx psm3_gen1_get_cl_q_head_index( - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - - return psm3_gen1_get_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_head); -} - -/* Getter for cl q tail indexes: */ -static inline psm3_gen1_cl_idx psm3_gen1_get_cl_q_tail_index( - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - - return psm3_gen1_get_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_tail); -} - -/* Setter for cl q head indexes: */ -static inline void psm3_gen1_set_cl_q_head_index( - psm3_gen1_cl_idx idx, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - - psm3_gen1_set_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_head, idx); - return; -} - -/* Setter for cl q tail indexes: */ -static inline void psm3_gen1_set_cl_q_tail_index( - psm3_gen1_cl_idx idx, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - - psm3_gen1_set_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_tail, idx); - return; -} - -/* Indicate whether the cl q is empty. - When this returns > 0 the cl q is empty. - When this returns == 0, the cl q is NOT empty (there are packets in the - circular list that are available to receive). - When this returns < 0, an error occurred. - the parameter should correspond to the head index of the - cl q circular list. */ -static inline int psm3_gen1_cl_q_empty(psm3_gen1_cl_idx head_idx, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) -{ - if (!get_psm_gen1_hi()->hfp_private.dma_rtail) - { - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - psm3_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; - int seq = psm3_gen1_hdrget_seq(pcl_q->hdr_qe.hdrq_base_addr + - (head_idx + get_psm_gen1_hi()->hfp_private.hdrq_rhf_off)); - - return (*pcl_q->hdr_qe.p_rx_hdrq_rhf_seq != seq); - } - - return (head_idx == psm3_gen1_get_cl_q_tail_index(cl_q, ctxt)); -} - -/* Returns expected sequence number for RHF. 
*/ -static inline int psm3_gen1_get_rhf_expected_sequence_number(unsigned int *pseqnum, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) - -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - psm3_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; - - *pseqnum = *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq; - return PSM_HAL_ERROR_OK; -} - -/* Sets expected sequence number for RHF. */ -static inline int psm3_gen1_set_rhf_expected_sequence_number(unsigned int seqnum, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) - -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - psm3_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; - - *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = seqnum; - return PSM_HAL_ERROR_OK; -} - -/* Checks sequence number from RHF. Returns PSM_HAL_ERROR_OK if the sequence number is good - * returns something else if the sequence number is bad. */ -static inline int psm3_gen1_check_rhf_sequence_number(unsigned int seqno) -{ - return (seqno <= LAST_RHF_SEQNO) ? - PSM_HAL_ERROR_OK : - PSM_HAL_ERROR_GENERAL_ERROR; -} - -static inline int psm3_gen1_get_rx_egr_tid_cnt(psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - return ctrl->ctxt_info.egrtids; -} - -static inline int psm3_gen1_get_rx_hdr_q_cnt(psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - return ctrl->ctxt_info.rcvhdrq_cnt; -} - -static inline int psm3_gen1_get_rx_hdr_q_ent_size(psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - return ctrl->ctxt_info.rcvhdrq_entsize; -} - -/* Retire the given head idx of the header q, and change *head_idx to point to the next - entry, lastly set *empty to indicate whether the headerq is empty at the new - head_idx. */ -static inline int psm3_gen1_retire_hdr_q_entry(psm3_gen1_cl_idx *idx, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt, - uint32_t elemsz, uint32_t elemlast, - int *emptyp) -{ - psm3_gen1_cl_idx tmp = *idx + elemsz; - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - psm3_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; - - if (!get_psm_gen1_hi()->hfp_private.dma_rtail) - { - (*pcl_q->hdr_qe.p_rx_hdrq_rhf_seq)++; - if (*pcl_q->hdr_qe.p_rx_hdrq_rhf_seq > LAST_RHF_SEQNO) - *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = 1; - } - if_pf(tmp > elemlast) - tmp = 0; - *emptyp = psm3_gen1_cl_q_empty(tmp, cl_q, ctxt); - *idx = tmp; - return PSM_HAL_ERROR_OK; -} - -static inline void psm3_gen1_get_ips_message_hdr(psm3_gen1_cl_idx idx, - psm3_gen1_raw_rhf_t rhf, - struct ips_message_header **imhp, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - psm3_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; - uint32_t *pu32 = pcl_q->hdr_qe.hdrq_base_addr + (idx + psm3_gen1_hdrget_hdrq_offset((uint32_t *)&rhf)); - *imhp = (struct ips_message_header*)pu32; -} - -static inline void psm3_gen1_get_rhf(psm3_gen1_cl_idx idx, - psm3_gen1_raw_rhf_t *rhfp, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) - -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - psm3_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; - uint32_t *pu32 = (pcl_q->hdr_qe.hdrq_base_addr + - (idx + get_psm_gen1_hi()->hfp_private.hdrq_rhf_off)); - *rhfp = *((psm3_gen1_raw_rhf_t*)pu32); -} - -/* Deliver an eager buffer given the index. - * If the index does not refer to a current egr buffer, get_egr_buff() - * returns NULL. 
- */ -static inline void *psm3_gen1_get_egr_buff(psm3_gen1_cl_idx idx, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - psm3_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; - return pcl_q->egr_buffs[idx]; -} - -/* Receive the raw rhf, decompose it, and then receive the ips_message_hdr. */ -/* caller has already initialized rcv_ev->proto, rcv_ev->recvq, - * and rcv_ev->gen1_hdr_q - */ -static inline int psm3_gen1_get_receive_event(psm3_gen1_cl_idx head_idx, psmi_hal_hw_context ctxt, int get_payload, - struct ips_recvhdrq_event *rcv_ev) -{ - psm3_gen1_get_rhf(head_idx, &rcv_ev->gen1_rhf.raw_rhf, rcv_ev->gen1_hdr_q, ctxt); - - /* here, we turn off the TFSEQ err bit if set: */ - rcv_ev->gen1_rhf.decomposed_rhf = rcv_ev->gen1_rhf.raw_rhf & (~(PSM3_GEN1_RHF_ERR_MASK_64(TFSEQ))); - - /* Now, get the lrh: */ - psm3_gen1_get_ips_message_hdr(head_idx, rcv_ev->gen1_rhf.raw_rhf, &rcv_ev->p_hdr, - rcv_ev->gen1_hdr_q, ctxt); - - // TBD - OPA computed this for CCA scan too, but not needed - // could put this within if get_payload below, but placed it here - // to faithfully duplicate the original OPA algorithm - rcv_ev->has_cksum = ((rcv_ev->proto->flags & IPS_PROTO_FLAG_CKSUM) && - (rcv_ev->p_hdr->flags & IPS_SEND_FLAG_PKTCKSUM)); - - // for FECN/BECN scan we don't need payload_size nor payload - // we are inline and caller passes a const, so this if test will - // optimize out. - if (get_payload) { - /* Compromise for better HAL API. For OPA, payload_size is not - * needed for TINY messages, getting payload_size and len here - * adds a few instructions to message rate critical path, but - * allows all the HALs to consistently set rcv_ev->payload_size - * and rcv_ev->payload in recvhdrq_progress and eliminates - * need for OPA specific ips_recvhdrq_event_paylen and - * payload functions. - */ - uint32_t cksum_len = rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0; - - rcv_ev->payload_size = psm3_gen1_rhf_get_packet_length(rcv_ev->gen1_rhf) - - (sizeof(struct ips_message_header) + - HFI_CRC_SIZE_IN_BYTES + cksum_len); - /* PSM does not use bth0].PadCnt, it figures out real datalen other way */ - - if (psm3_gen1_rhf_get_use_egr_buff(rcv_ev->gen1_rhf)) - rcv_ev->payload = (uint8_t*)(psm3_gen1_get_egr_buff( - psm3_gen1_rhf_get_egr_buff_index(rcv_ev->gen1_rhf), - (psm3_gen1_cl_q)(rcv_ev->gen1_hdr_q + 1) /* The circular list q - (cl_q) for the egr buff for any rx - hdrq event is always one more than - the hdrq cl q */, - rcv_ev->recvq->context->psm_hw_ctxt))+ - (psm3_gen1_rhf_get_egr_buff_offset(rcv_ev->gen1_rhf)*64); - else - rcv_ev->payload = NULL; - } - - /* If the hdrq_head is before cachedlastscan, that means that we have - * already prescanned this for BECNs and FECNs, so we should not check - * again - */ - if_pt((rcv_ev->proto->flags & IPS_PROTO_FLAG_CCA) && - (head_idx >= rcv_ev->recvq->state->hdrq_cachedlastscan)) { - /* IBTA CCA handling: - * If FECN bit set handle IBTA CCA protocol. For the - * flow that suffered congestion we flag it to generate - * a control packet with the BECN bit set - This is - * currently an unsolicited ACK. - * - * For all MQ packets the FECN processing/BECN - * generation is done in the is_expected_or_nak - * function as each eager packet is inspected there. - * - * For TIDFLOW/Expected data transfers the FECN - * bit/BECN generation is done in protoexp_data. 
Since - * header suppression can result in even FECN packets - * being suppressed the expected protocol generated - * additional BECN packets if a "large" number of - * generations are swapped without progress being made - * for receive. "Large" is set empirically to 4. - * - * FECN packets are ignored for all control messages - * (except ACKs and NAKs) since they indicate - * congestion on the control path which is not rate - * controlled. The CCA specification allows FECN on - * ACKs to be disregarded as well. - */ - - rcv_ev->is_congested = - _is_cca_fecn_set(rcv_ev-> - p_hdr) & IPS_RECV_EVENT_FECN; - rcv_ev->is_congested |= - (_is_cca_becn_set(rcv_ev->p_hdr) << - (IPS_RECV_EVENT_BECN - 1)); - } else - rcv_ev->is_congested = 0; - - return PSM_HAL_ERROR_OK; -} - -/* At the end of each scb struct, we have space reserved to accommodate - * three structures (for GEN1)- - * struct psm_hal_sdma_req_info, struct psm_hal_pbc and struct ips_message_header. - * The HIC should get the size needed for the extended memory region - * using a HAL call (psmi_hal_get_scb_extended_mem_size). For Gen1, this API - * will return the size of the below struct psm_hal_gen1_scb_extended - * aligned up to be able to fit struct psm_hal_pbc on a 64-byte boundary. - */ - -#define PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT 1 - -struct psm_hal_gen1_scb_extended { - union - { - struct sdma_req_info sri1; - struct sdma_req_info_v6_3 sri2; - }; - struct { - struct psm_hal_pbc pbc; - struct ips_message_header ips_lrh; - } PSMI_CACHEALIGN; -}; - -static const struct -{ - uint32_t hfi1_event_bit, psmi_hal_hfi_event_bit; -} hfi1_events_map[] = -{ - { HFI1_EVENT_FROZEN, PSM_HAL_HFI_EVENT_FROZEN }, - { HFI1_EVENT_LINKDOWN, PSM_HAL_HFI_EVENT_LINKDOWN }, - { HFI1_EVENT_LID_CHANGE, PSM_HAL_HFI_EVENT_LID_CHANGE }, - { HFI1_EVENT_LMC_CHANGE, PSM_HAL_HFI_EVENT_LMC_CHANGE }, - { HFI1_EVENT_SL2VL_CHANGE, PSM_HAL_HFI_EVENT_SL2VL_CHANGE }, - { HFI1_EVENT_TID_MMU_NOTIFY, PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY}, -}; - -psm2_error_t psm3_gen1_ips_ptl_init_pre_proto_init(struct ptl_ips *ptl); -psm2_error_t psm3_gen1_ips_ptl_init_post_proto_init(struct ptl_ips *ptl); -psm2_error_t psm3_gen1_ips_ptl_fini(struct ptl_ips *ptl); -void psm3_gen1_ips_ptl_init_sl2sc_table(struct ips_proto *proto); -psm2_error_t psm3_gen1_ptl_ips_update_linkinfo(struct ips_proto *proto); - -psm2_error_t psm3_gen1_ips_ptl_pollintr(psm2_ep_t ep, - struct ips_recvhdrq *recvq, int fd_pipe, int next_timeout, - uint64_t *pollok, uint64_t *pollcyc); - -int psm3_gen1_ips_ptl_process_err_chk_gen(struct ips_recvhdrq_event *rcv_ev); -int psm3_gen1_ips_ptl_process_becn(struct ips_recvhdrq_event *rcv_ev); -int psm3_gen1_ips_ptl_process_unknown(const struct ips_recvhdrq_event *rcv_ev); -int psm3_gen1_ips_ptl_process_packet_error(struct ips_recvhdrq_event *rcv_ev); -unsigned psm3_gen1_parse_tid(int reload); - -psm2_error_t -psm3_gen1_recvhdrq_init(const psmi_context_t *context, - const struct ips_epstate *epstate, - const struct ips_proto *proto, - const struct ips_recvhdrq_callbacks *callbacks, - uint32_t subcontext, - struct ips_recvhdrq *recvq - , struct ips_recvhdrq_state *recvq_state, - psm3_gen1_cl_q cl_q - ); - -psm2_error_t psm3_gen1_recvhdrq_progress(struct ips_recvhdrq *recvq); - - /* This function is designed to implement RAPID CCA. It iterates - * through the recvq, checking each element for set FECN or BECN bits. - * In the case of finding one, the proper response is executed, and the bits - * are cleared. 
- */ -psm2_error_t psm3_gen1_recvhdrq_scan_cca(struct ips_recvhdrq *recvq); - -PSMI_INLINE(int psm3_gen1_recvhdrq_isempty(const struct ips_recvhdrq *recvq)) -{ - return psm3_gen1_cl_q_empty(recvq->state->hdrq_head, - recvq->gen1_cl_hdrq, - recvq->context->psm_hw_ctxt); -} - -#ifdef PSM_CUDA -void psm3_hfp_gen1_gdr_open(void); -void psm3_gen1_gdr_close(void); -void* psm3_gen1_gdr_convert_gpu_to_host_addr(unsigned long buf, - size_t size, int flags, psm2_ep_t ep); -uint64_t psm3_gen1_gdr_cache_evict(void); -#endif /* PSM_CUDA */ - -/* Get pbc static rate value for flow for a given message length */ -PSMI_ALWAYS_INLINE( -uint16_t -psm3_gen1_pbc_static_rate(struct ips_proto *proto, struct ips_flow *flow, - uint32_t msgLen)) -{ - uint32_t rate = 0; - - /* The PBC rate is based on which HFI type as different media have different - * mechanism for static rate control. - */ - - switch (proto->epinfo.ep_hfi_type) { - case PSMI_HFI_TYPE_OPA1: - { - /* - * time_to_send is: - * - * (packet_length) [bits] / (pkt_egress_rate) [bits/sec] - * ----------------------------------------------------- - * fabric_clock_period == (1 / 805 * 10^6) [1/sec] - * - * (where pkt_egress_rate is assumed to be 100 Gbit/s.) - */ - uint32_t time_to_send = (8 * msgLen * 805) / (100000); - rate = (time_to_send >> flow->path->opa.pr_cca_divisor) * - (flow->path->opa.pr_active_ipd); - - if (rate > 65535) - rate = 65535; - - } - break; - - default: - rate = 0; - } - - return (uint16_t) rate; -} - -/* This is a helper function to convert Per Buffer Control to little-endian */ -PSMI_ALWAYS_INLINE( -void psm3_gen1_pbc_to_le(struct psm_hal_pbc *pbc)) -{ - pbc->pbc0 = __cpu_to_le32(pbc->pbc0); - pbc->PbcStaticRateControlCnt = __cpu_to_le16(pbc->PbcStaticRateControlCnt); - pbc->fill1 = __cpu_to_le16(pbc->fill1); -} - -/* Set PBC struct that lies within the extended memory region of SCB */ -/* This is used for PIO and SDMA cases; pbc is really a pointer to - * struct ips_pbc_header * or the equivalent un-named structure - * in ips_scb. 
Please note pcb will be in little-endian byte - * order on return */ -PSMI_ALWAYS_INLINE( -void -psm3_gen1_pbc_update(struct ips_proto *proto, struct ips_flow *flow, - uint32_t isCtrlMsg, struct psm_hal_pbc *pbc, uint32_t hdrlen, - uint32_t paylen)) -{ - hfp_gen1_pc_private *psm_hw_ctxt = proto->ep->context.psm_hw_ctxt; - int dw = (sizeof(struct psm_hal_pbc) + hdrlen + paylen) >> BYTE2DWORD_SHIFT; - int sc = proto->sl2sc[flow->path->pr_sl]; - int vl = psm_hw_ctxt->sc2vl[sc]; - uint16_t static_rate = 0; - - if_pf(!isCtrlMsg && flow->path->opa.pr_active_ipd) - static_rate = - psm3_gen1_pbc_static_rate(proto, flow, hdrlen + paylen); - - pbc->pbc0 = __cpu_to_le32((dw & HFI_PBC_LENGTHDWS_MASK) | - ((vl & HFI_PBC_VL_MASK) << HFI_PBC_VL_SHIFT) | - (((sc >> HFI_PBC_SC4_SHIFT) & - HFI_PBC_SC4_MASK) << HFI_PBC_DCINFO_SHIFT)); - - pbc->PbcStaticRateControlCnt = __cpu_to_le16(static_rate & HFI_PBC_STATICRCC_MASK); - - /* Per Buffer Control must be in little-endian */ - psm3_gen1_pbc_to_le(pbc); - - return; -} - -PSMI_ALWAYS_INLINE( -int psm3_gen1_get_sdma_ring_size(psmi_hal_hw_context ctxt)) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - return ctrl->ctxt_info.sdma_ring_size; -} - -PSMI_ALWAYS_INLINE( -int psm3_gen1_get_fd(psmi_hal_hw_context ctxt)) -{ - if (!ctxt) - return -1; - - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - - return psm_hw_ctxt->ctrl->fd; -} - -PSMI_ALWAYS_INLINE( -int psm3_gen1_hfi_reset_context(psmi_hal_hw_context ctxt)) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - return psm3_gen1_nic_reset_context(ctrl); -} - -PSMI_ALWAYS_INLINE(int psm3_gen1_get_context(psmi_hal_hw_context ctxt)) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - return ctrl->ctxt_info.ctxt; -} -#endif /* _PSM_HAL_GEN1_HAL_H */ -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_hal_inline_i.h b/psm3/hal_gen1/gen1_hal_inline_i.h deleted file mode 100644 index a6cb44e..0000000 --- a/psm3/hal_gen1/gen1_hal_inline_i.h +++ /dev/null @@ -1,1653 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2017 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2017 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. 
- * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "gen1_hal.h" - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_jkey(psm2_ep_t ep); - -extern size_t psm3_gen1_arrsz[MAPSIZE_MAX]; - -static void psm3_gen1_free_egr_buffs(hfp_gen1_pc_private *psm_hw_ctxt) -{ -#define FREE_EGR_BUFFS_TABLE(cl_qs_arr, index) psm3_ips_recvq_egrbuf_table_free(((cl_qs_arr)[index]).egr_buffs) - size_t i, index, subctxt_cnt; - psm3_gen1_cl_q_t *cl_qs; - - cl_qs = psm_hw_ctxt->cl_qs; - index = PSM3_GEN1_CL_Q_RX_EGR_Q; - FREE_EGR_BUFFS_TABLE(cl_qs, index); - - subctxt_cnt = psm_hw_ctxt->user_info.subctxt_cnt; - for (i = 0; i < subctxt_cnt; i++) { - index = PSM3_GEN1_GET_SC_CL_Q_RX_EGR_Q(i); - FREE_EGR_BUFFS_TABLE(cl_qs, index); - } -#undef FREE_EGR_BUFFS_TABLE -} - -static void psm3_gen1_unmap_hfi_mem(hfp_gen1_pc_private *psm_hw_ctxt) -{ - size_t subctxt_cnt = psm_hw_ctxt->user_info.subctxt_cnt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - struct hfi1_base_info *binfo = &ctrl->base_info; - struct hfi1_ctxt_info *cinfo = &ctrl->ctxt_info; - - /* 1. Unmap the PIO credits address */ - HFI_MUNMAP_ERRCHECK(binfo, sc_credits_addr, psm3_gen1_arrsz[SC_CREDITS]); - - /* 2. Unmap the PIO buffer SOP address */ - HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase_sop, psm3_gen1_arrsz[PIO_BUFBASE_SOP]); - - /* 3. Unmap the PIO buffer address */ - HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase, psm3_gen1_arrsz[PIO_BUFBASE]); - - /* 4. Unmap the receive header queue */ - HFI_MUNMAP_ERRCHECK(binfo, rcvhdr_bufbase, psm3_gen1_arrsz[RCVHDR_BUFBASE]); - - /* 5. Unmap the receive eager buffer */ - HFI_MUNMAP_ERRCHECK(binfo, rcvegr_bufbase, psm3_gen1_arrsz[RCVEGR_BUFBASE]); - - /* 6. Unmap the sdma completion queue */ - HFI_MUNMAP_ERRCHECK(binfo, sdma_comp_bufbase, psm3_gen1_arrsz[SDMA_COMP_BUFBASE]); - - /* 7. Unmap RXE per-context CSRs */ - HFI_MUNMAP_ERRCHECK(binfo, user_regbase, psm3_gen1_arrsz[USER_REGBASE]); - ctrl->__hfi_rcvhdrtail = NULL; - ctrl->__hfi_rcvhdrhead = NULL; - ctrl->__hfi_rcvegrtail = NULL; - ctrl->__hfi_rcvegrhead = NULL; - ctrl->__hfi_rcvofftail = NULL; - if (cinfo->runtime_flags & HFI1_CAP_HDRSUPP) { - ctrl->__hfi_rcvtidflow = NULL; - } - - /* 8. Unmap the rcvhdrq tail register address */ - if (cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { - /* only unmap the RTAIL if it was enabled in the first place */ - HFI_MUNMAP_ERRCHECK(binfo, rcvhdrtail_base, psm3_gen1_arrsz[RCVHDRTAIL_BASE]); - } else { - binfo->rcvhdrtail_base = 0; - } - - /* 9. Unmap the event page */ - HFI_MUNMAP_ERRCHECK(binfo, events_bufbase, psm3_gen1_arrsz[EVENTS_BUFBASE]); - - /* 10. 
Unmap the status page */ - HFI_MUNMAP_ERRCHECK(binfo, status_bufbase, psm3_gen1_arrsz[STATUS_BUFBASE]); - - /* 11. If subcontext is used, unmap the buffers */ - if (subctxt_cnt > 0) { - /* only unmap subcontext-related stuff if subcontexts are enabled */ - HFI_MUNMAP_ERRCHECK(binfo, subctxt_uregbase, psm3_gen1_arrsz[SUBCTXT_UREGBASE]); - HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvhdrbuf, psm3_gen1_arrsz[SUBCTXT_RCVHDRBUF]); - HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvegrbuf, psm3_gen1_arrsz[SUBCTXT_RCVEGRBUF]); - } -} - -#include "gen1_spio.c" - -static PSMI_HAL_INLINE int psm3_hfp_gen1_close_context(psm2_ep_t ep) -{ - hfp_gen1_pc_private *psm_hw_ctxt = (hfp_gen1_pc_private *)(ep->context.psm_hw_ctxt); - - if (!psm_hw_ctxt) - return PSM_HAL_ERROR_OK; - /* Free the egress buffers */ - psm3_gen1_free_egr_buffs(psm_hw_ctxt); - - /* Unmap the HFI memory */ - psm3_gen1_unmap_hfi_mem(psm_hw_ctxt); - - /* Clean up the rest */ - close(psm_hw_ctxt->ctrl->fd); - free(psm_hw_ctxt->ctrl); - psmi_free(psm_hw_ctxt); - ep->context.psm_hw_ctxt = 0; - - return PSM_HAL_ERROR_OK; -} - -/* Check NIC and context status, returns one of - * - * PSM2_OK: Port status is ok (or context not initialized yet but still "ok") - * PSM2_OK_NO_PROGRESS: Cable pulled - * PSM2_EP_NO_NETWORK: No network, no lid, ... - * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. - */ -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_context_check_status(struct ptl_ips *ptl) -{ - psm2_error_t err = psm3_gen1_context_check_hw_status(ptl->ep); - if (err == PSM2_OK || err == PSM2_OK_NO_PROGRESS) - { - int rc = psm3_gen1_spio_process_events((struct ptl *)ptl); - err = rc >= 0 ? PSM2_OK : PSM2_INTERNAL_ERR; - } - return err; -} - -#ifdef PSM_FI -static PSMI_HAL_INLINE int psm3_hfp_gen1_faultinj_allowed(const char *name, - psm2_ep_t ep) -{ - return 1; -} -#endif - -/* Moved from psm_context.c */ - -ustatic PSMI_HAL_INLINE -int MOCKABLE(psm3_gen1_sharedcontext_params)(int *nranks, int *rankid); -MOCK_DCL_EPILOGUE(psm3_gen1_sharedcontext_params); -ustatic PSMI_HAL_INLINE psm2_error_t psm3_gen1_init_userinfo_params(psm2_ep_t ep, - int unit_id, - psm2_uuid_t const unique_job_key, - struct hfi1_user_info_dep *user_info); - -/* - * Prepare user_info params for driver open, used only in psm3_context_open - */ -ustatic PSMI_HAL_INLINE -psm2_error_t -psm3_gen1_init_userinfo_params(psm2_ep_t ep, int unit_id, - psm2_uuid_t const unique_job_key, - struct hfi1_user_info_dep *user_info) -{ - // TBD - known issue, when HAL is built as pure inline - // can't declare static variables in an inline function - // (and shouldn't declare in a header file in general) - /* static variables, shared among rails */ - static int shcontexts_enabled = -1, rankid, nranks; - - int avail_contexts = 0, max_contexts, ask_contexts; - int ranks_per_context = 0; - psm2_error_t err = PSM2_OK; - union psmi_envvar_val env_maxctxt, env_ranks_per_context; - static int subcontext_id_start; - - memset(user_info, 0, sizeof(*user_info)); - user_info->userversion = HFI1_USER_SWMINOR|(psm3_gen1_get_user_major_version()<<HFI1_SWMAJOR_SHIFT); - user_info->subctxt_id = 0; - user_info->subctxt_cnt = 0; - memcpy(user_info->uuid, unique_job_key, sizeof(user_info->uuid)); - - if (shcontexts_enabled == -1) { - shcontexts_enabled = - psm3_gen1_sharedcontext_params(&nranks, &rankid); - } - if (!shcontexts_enabled) - return err; - - avail_contexts = psm3_hfp_gen1_get_num_contexts(unit_id); - - if (avail_contexts == 0) { - err = psm3_handle_error(NULL, PSM2_EP_NO_DEVICE, - "PSM3 found 0 available contexts on opa device(s)."); - goto
fail; - } - - /* See if the user wants finer control over context assignments */ - if (!psm3_getenv("PSM3_MAX_CONTEXTS_PER_JOB", - "Maximum number of contexts for this PSM3 job", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)avail_contexts, &env_maxctxt)) { - max_contexts = max(env_maxctxt.e_int, 1); /* needs to be non-negative */ - ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */ - } else if (!psm3_getenv("PSM3_SHAREDCONTEXTS_MAX", - "", /* deprecated */ - PSMI_ENVVAR_LEVEL_HIDDEN | PSMI_ENVVAR_LEVEL_NEVER_PRINT, - PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)avail_contexts, &env_maxctxt)) { - - _HFI_INFO - ("The PSM3_SHAREDCONTEXTS_MAX env variable is deprecated. Please use PSM3_MAX_CONTEXTS_PER_JOB in future.\n"); - - max_contexts = max(env_maxctxt.e_int, 1); /* needs to be non-negative */ - ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */ - } else - ask_contexts = max_contexts = avail_contexts; - - if (!psm3_getenv("PSM3_RANKS_PER_CONTEXT", - "Number of ranks per context", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env_ranks_per_context)) { - ranks_per_context = max(env_ranks_per_context.e_int, 1); - ranks_per_context = min(ranks_per_context, HFI1_MAX_SHARED_CTXTS); - } - - /* - * See if we could get a valid ppn. If not, approximate it to be the - * number of cores. - */ - if (nranks == -1) { - long nproc = sysconf(_SC_NPROCESSORS_ONLN); - if (nproc < 1) - nranks = 1; - else - nranks = nproc; - } - - /* - * Make sure that our guesses are good educated guesses - */ - if (rankid >= nranks) { - _HFI_PRDBG - ("PSM3_SHAREDCONTEXTS disabled because lrank=%d,ppn=%d\n", - rankid, nranks); - goto fail; - } - - if (ranks_per_context) { - int contexts = - (nranks + ranks_per_context - 1) / ranks_per_context; - if (contexts > ask_contexts) { - err = psm3_handle_error(NULL, PSM2_EP_NO_DEVICE, - "Incompatible settings for " - "PSM3_MAX_CONTEXTS_PER_JOB and PSM3_RANKS_PER_CONTEXT"); - goto fail; - } - ask_contexts = contexts; - } - - /* group id based on total groups and local rank id */ - user_info->subctxt_id = subcontext_id_start + rankid % ask_contexts; - /* this is for multi-rail, when we setup a new rail, - * we can not use the same subcontext ID as the previous - * rail, otherwise, the driver will match previous rail - * and fail. - */ - subcontext_id_start += ask_contexts; - - /* Need to compute with how many *other* peers we will be sharing the - * context */ - if (nranks > ask_contexts) { - user_info->subctxt_cnt = nranks / ask_contexts; - /* If ppn != multiple of contexts, some contexts get an uneven - * number of subcontexts */ - if (nranks % ask_contexts > rankid % ask_contexts) - user_info->subctxt_cnt++; - /* The case of 1 process "sharing" a context (giving 1 subcontext) - * is supcontexted by the driver and PSM. However, there is no - * need to share in this case so disable context sharing. */ - if (user_info->subctxt_cnt == 1) - user_info->subctxt_cnt = 0; - if (user_info->subctxt_cnt > HFI1_MAX_SHARED_CTXTS) { - err = psm3_handle_error(NULL, PSM2_INTERNAL_ERR, - "Calculation of subcontext count exceeded maximum supported"); - goto fail; - } - } - /* else subcontext_cnt remains 0 and context sharing is disabled. 
*/ - - _HFI_PRDBG("PSM3_SHAREDCONTEXTS lrank=%d,ppn=%d,avail_contexts=%d," - "max_contexts=%d,ask_contexts=%d," - "ranks_per_context=%d,id=%u,cnt=%u\n", - rankid, nranks, avail_contexts, max_contexts, - ask_contexts, ranks_per_context, - user_info->subctxt_id, user_info->subctxt_cnt); -fail: - return err; -} - -ustatic -int MOCKABLE(psm3_gen1_sharedcontext_params)(int *nranks, int *rankid) -{ - union psmi_envvar_val enable_shcontexts; - - *rankid = -1; - *nranks = -1; - - /* We do not support context sharing for multiple endpoints */ - if (psm3_multi_ep_enabled) { - return 0; - } - - /* New name in 2.0.1, keep observing old name */ - psm3_getenv("PSM3_SHAREDCONTEXTS", "Enable shared contexts", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO, - (union psmi_envvar_val) - PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT, - &enable_shcontexts); - if (!enable_shcontexts.e_int) - return 0; - - if (psm3_get_mylocalrank() >= 0 && psm3_get_mylocalrank_count() >= 0) { - *rankid = psm3_get_mylocalrank(); - *nranks = psm3_get_mylocalrank_count(); - return 1; - } else - return 0; -} -MOCK_DEF_EPILOGUE(psm3_gen1_sharedcontext_params); - -/* moved from ips_subcontext.c */ -static PSMI_HAL_INLINE psm2_error_t -psm3_gen1_divvy_shared_mem_ptrs(hfp_gen1_pc_private *pc_private, - psmi_context_t *context, - const struct hfi1_base_info *base_info) -{ - struct gen1_ips_hwcontext_ctrl **hwcontext_ctrl = &pc_private->hwcontext_ctrl; - uint32_t subcontext_cnt = pc_private->user_info.subctxt_cnt; - struct gen1_ips_subcontext_ureg **uregp = &pc_private->subcontext_ureg[0]; - - uintptr_t all_subcontext_uregbase = - (uintptr_t) base_info->subctxt_uregbase; - int i; - - psmi_assert_always(all_subcontext_uregbase != 0); - for (i = 0; i < HFI1_MAX_SHARED_CTXTS; i++) { - struct gen1_ips_subcontext_ureg *subcontext_ureg = - (struct gen1_ips_subcontext_ureg *)all_subcontext_uregbase; - *uregp++ = (i < subcontext_cnt) ? 
subcontext_ureg : NULL; - all_subcontext_uregbase += sizeof(struct gen1_ips_subcontext_ureg); - } - - *hwcontext_ctrl = - (struct gen1_ips_hwcontext_ctrl *)all_subcontext_uregbase; - all_subcontext_uregbase += sizeof(struct gen1_ips_hwcontext_ctrl); - - context->spio_ctrl = (void *)all_subcontext_uregbase; - all_subcontext_uregbase += sizeof(struct psm3_gen1_spio_ctrl); - - context->tid_ctrl = (void *)all_subcontext_uregbase; - all_subcontext_uregbase += sizeof(struct ips_tid_ctrl); - - context->tf_ctrl = (void *)all_subcontext_uregbase; - all_subcontext_uregbase += sizeof(struct ips_tf_ctrl); - - psmi_assert((all_subcontext_uregbase - - (uintptr_t) base_info->subctxt_uregbase) <= PSMI_PAGESIZE); - - return PSM2_OK; -} - -static PSMI_HAL_INLINE -uint64_t psm3_gen1_get_cap_mask(uint64_t gen1_mask) -{ - // TBD - known issue, when HAL is built as pure inline - // can't declare static variables in an inline function - // (and shouldn't delcare in a header file in general) - static const struct - { - uint64_t gen1_bit; - uint32_t psmi_hal_bit; - } bit_map[] = - { - { HFI1_CAP_SDMA, PSM_HAL_CAP_SDMA }, - { HFI1_CAP_SDMA_AHG, PSM_HAL_CAP_SDMA_AHG }, - { HFI1_CAP_EXTENDED_PSN, PSM_HAL_CAP_EXTENDED_PSN }, - { HFI1_CAP_HDRSUPP, PSM_HAL_CAP_HDRSUPP }, - { HFI1_CAP_USE_SDMA_HEAD, PSM_HAL_CAP_USE_SDMA_HEAD }, - { HFI1_CAP_MULTI_PKT_EGR, PSM_HAL_CAP_MULTI_PKT_EGR }, - { HFI1_CAP_NODROP_RHQ_FULL, PSM_HAL_CAP_NODROP_RHQ_FULL }, - { HFI1_CAP_NODROP_EGR_FULL, PSM_HAL_CAP_NODROP_EGR_FULL }, - { HFI1_CAP_TID_UNMAP, PSM_HAL_CAP_TID_UNMAP }, - { HFI1_CAP_PRINT_UNIMPL, PSM_HAL_CAP_PRINT_UNIMPL }, - { HFI1_CAP_ALLOW_PERM_JKEY, PSM_HAL_CAP_ALLOW_PERM_JKEY }, - { HFI1_CAP_NO_INTEGRITY, PSM_HAL_CAP_NO_INTEGRITY }, - { HFI1_CAP_PKEY_CHECK, PSM_HAL_CAP_PKEY_CHECK }, - { HFI1_CAP_STATIC_RATE_CTRL, PSM_HAL_CAP_STATIC_RATE_CTRL }, - { HFI1_CAP_SDMA_HEAD_CHECK, PSM_HAL_CAP_SDMA_HEAD_CHECK }, - { HFI1_CAP_EARLY_CREDIT_RETURN, PSM_HAL_CAP_EARLY_CREDIT_RETURN }, -#ifdef HFI1_CAP_GPUDIRECT_OT - { HFI1_CAP_GPUDIRECT_OT, PSM_HAL_CAP_GPUDIRECT }, - { HFI1_CAP_GPUDIRECT_OT, PSM_HAL_CAP_GPUDIRECT_RDMA }, -#else /* #ifdef HFI1_CAP_GPUDIRECT_OT */ -#ifndef PSM_CUDA - /* lifted from hfi1_user.h */ - { (1UL << 63), PSM_HAL_CAP_GPUDIRECT }, - { (1UL << 63), PSM_HAL_CAP_GPUDIRECT_RDMA }, -#else /* #ifndef PSM_CUDA */ -#error "Inconsistent build. HFI1_CAP_GPUDIRECT_OT must be defined for CUDA builds. 
Must use CUDA enabled driver headers" -#endif /* #ifndef PSM_CUDA */ -#endif /* #ifdef HFI1_CAP_GPUDIRECT_OT */ - }; - uint64_t rv = 0; - int i; - for (i=0;i < sizeof(bit_map)/sizeof(bit_map[0]);i++) - { - if (bit_map[i].gen1_bit & gen1_mask) - rv |= bit_map[i].psmi_hal_bit; - } - return rv; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_context_open(int unit, - int port, int addr_index, - uint64_t open_timeout, - psm2_ep_t ep, - psm2_uuid_t const job_key, - unsigned retryCnt) -{ - psm2_error_t err = PSM2_OK; - int fd = -1; - psmi_context_t *psm_ctxt = &ep->context; - hfp_gen1_pc_private *pc_private = psmi_malloc(ep, UNDEFINED, sizeof(hfp_gen1_pc_private)); - - psmi_assert_always(!ep->context.psm_hw_ctxt); - psmi_assert_always(psm3_epid_zero_internal(ep->epid)); - if_pf (!pc_private) { - //err = -PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT; - goto bail_fd; - } - - memset(pc_private, 0, sizeof(hfp_gen1_pc_private)); - - ep->rdmamode = psm3_gen1_parse_tid(0); - // MR cache N/A (gen1 uses TID cache), leave ep->mr_cache_mode and - // ep->rv_gpu_cache_size as set by caller (NONE, 0) - - char dev_name[PATH_MAX]; - fd = psm3_gen1_nic_context_open_ex(unit, port, open_timeout, - dev_name, sizeof(dev_name)); - if (fd < 0) - { - err = -PSM_HAL_ERROR_CANNOT_OPEN_DEVICE; - goto bail_fd; - } - - err = psm3_gen1_init_userinfo_params(ep, - unit, - job_key, - &pc_private->user_info); - if (err) { - err = -PSM_HAL_ERROR_GENERAL_ERROR; - goto bail_fd; - } - - cpu_set_t mycpuset; - if (psm3_sysfs_get_unit_cpumask(unit, &mycpuset)) { - _HFI_ERROR( "Failed to get %s (unit %d) cpu set\n", ep->dev_name, unit); - //err = -PSM_HAL_ERROR_GENERAL_ERROR; - goto bail_fd; - } - - if (psm3_context_set_affinity(ep, mycpuset)) - goto bail_fd; - - /* attempt to assign the context via psm3_gen1_userinit_internal() - * and mmap the HW resources */ - int retry = 0; - do { - if (retry > 0) - _HFI_INFO("psm3_gen1_userinit_internal: failed, trying again (%d/%d)\n", - retry, retryCnt); - pc_private->ctrl = psm3_gen1_userinit_internal(fd, ep->skip_affinity, - &pc_private->user_info); - } while (pc_private->ctrl == NULL && ++retry <= retryCnt); - - if (!pc_private->ctrl) - { - err = -PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT; - goto bail_fd; - } - else - { - - if (psm3_parse_identify()) { - printf("%s %s run-time driver interface v%d.%d\n", - psm3_get_mylabel(), psm3_ident_tag, - psm3_gen1_get_user_major_version(), - psm3_gen1_get_user_minor_version()); - } - - struct _hfi_ctrl *ctrl = pc_private->ctrl; - int i; - int lid; - - if ((lid = psm3_gen1_get_port_lid(ctrl->__hfi_unit, - ctrl->__hfi_port, addr_index, GEN1_FILTER)) <= 0) { - err = psm3_handle_error(NULL, - PSM2_EP_DEVICE_FAILURE, - "Can't get HFI LID in psm3_ep_open: is SMA running?"); - goto bail; - } - if (psm3_hfp_gen1_get_port_subnet(ctrl->__hfi_unit, ctrl->__hfi_port, addr_index, - &ep->subnet, &ep->addr, - NULL, &ep->gid) == -1) { - err = - psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "Can't get HFI GID in psm3_ep_open: is SMA running?"); - goto bail; - } - ep->unit_id = ctrl->__hfi_unit; - ep->portnum = ctrl->__hfi_port; - ep->addr_index = addr_index; - ep->dev_name = psm3_sysfs_unit_dev_name(ep->unit_id); - - /* Endpoint out_sl contains the default SL to use for this endpoint. */ - /* Get the MTU for this SL. 
*/ - int sc; - if ((sc=psm3_gen1_get_port_sl2sc(ep->unit_id, - ctrl->__hfi_port, - ep->out_sl)) < 0) { - sc = PSMI_SC_DEFAULT; - } - int vl; - if ((vl = psm3_gen1_get_port_sc2vl(ep->unit_id, - ctrl->__hfi_port, - sc)) < 0) { - vl = PSMI_VL_DEFAULT; - } - if (sc == PSMI_SC_ADMIN || - vl == PSMI_VL_ADMIN) { - err = psm3_handle_error(NULL, PSM2_INTERNAL_ERR, - "Invalid sl: %d, please specify correct sl via PSM3_NIC_SL", - ep->out_sl); - goto bail; - } - - if ((ep->mtu = psm3_gen1_get_port_vl2mtu(ep->unit_id, - ctrl->__hfi_port, - vl)) < 0) { - err = - psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "Can't get MTU for VL %d", - vl); - goto bail; - } - - get_psm_gen1_hi()->phi.params.cap_mask |= - psm3_gen1_get_cap_mask(ctrl->ctxt_info.runtime_flags) - | PSM_HAL_CAP_MERGED_TID_CTRLS - | PSM_HAL_CAP_RSM_FECN_SUPP; - - int driver_major = psm3_gen1_get_user_major_version(); - int driver_minor = psm3_gen1_get_user_minor_version(); - - if ((driver_major > 6) || - ((driver_major == 6) && - (driver_minor >= 3))) - { - get_psm_gen1_hi()->phi.params.cap_mask |= PSM_HAL_CAP_DMA_HSUPP_FOR_32B_MSGS; - } - - get_psm_gen1_hi()->hfp_private.sdmahdr_req_size = HFI_SDMA_HDR_SIZE; - - if (psm3_gen1_check_non_dw_mul_sdma()) - get_psm_gen1_hi()->phi.params.cap_mask |= PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE; - /* The dma_rtail member is: 1 when the HFI1_CAP_DMA_RTAIL bit is set. - 0 when the HFI1_CAP_DMA_RTAIL bit is NOT set. */ - get_psm_gen1_hi()->hfp_private.dma_rtail = 0 != (HFI1_CAP_DMA_RTAIL & ctrl->ctxt_info.runtime_flags); - - psm_ctxt->psm_hw_ctxt = pc_private; - if (pc_private->user_info.subctxt_cnt > 0) - psm3_gen1_divvy_shared_mem_ptrs(pc_private, - psm_ctxt, - &ctrl->base_info); - - /* Initialize all of the cl q's. */ - - get_psm_gen1_hi()->hfp_private.hdrq_rhf_off = (ctrl->ctxt_info.rcvhdrq_entsize - 8) >> BYTE2DWORD_SHIFT; - - /* The following guard exists to workaround a critical issue flagged by KW to prevent - subscripting past the end of the cl_qs[] array in the following for () loop. 
*/ - if (pc_private->user_info.subctxt_cnt <= HFI1_MAX_SHARED_CTXTS) - { - /* Here, we are initializing only the rx hdrq rhf seq for all subcontext - cl q's: */ - for (i=PSM3_GEN1_CL_Q_RX_HDR_Q_SC_0; i < - PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(pc_private->user_info.subctxt_cnt); i += 2) - { - psm3_gen1_cl_q_t *pcl_q = &(pc_private->cl_qs[i]); - - pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = &pcl_q->hdr_qe.rx_hdrq_rhf_seq; - if (get_psm_gen1_hi()->hfp_private.dma_rtail) - pcl_q->hdr_qe.rx_hdrq_rhf_seq = 0; - else - pcl_q->hdr_qe.rx_hdrq_rhf_seq = 1; - } - } - /* Next, initialize the hw rx hdr q and egr buff q: */ - { - /* base address of user registers */ - volatile uint64_t *uregbase = (volatile uint64_t *)(uintptr_t) (ctrl->base_info.user_regbase); - /* hw rx hdr q: */ - psm3_gen1_cl_q_t *pcl_q = &(pc_private->cl_qs[PSM3_GEN1_CL_Q_RX_HDR_Q]); - pcl_q->cl_q_head = (volatile uint64_t *)&(uregbase[ur_rcvhdrhead]); - pcl_q->cl_q_tail = (volatile uint64_t *)&(uregbase[ur_rcvhdrtail]); - pcl_q->hdr_qe.hdrq_base_addr = (uint32_t *) (ctrl->base_info.rcvhdr_bufbase); - - /* Initialize the ptr to the rx hdrq rhf seq: */ - if (pc_private->user_info.subctxt_cnt > 0) - /* During sharing of a context, the h/w hdrq rhf_seq is placed in shared memory and is shared - by all subcontexts: */ - pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = &pc_private->hwcontext_ctrl->rx_hdrq_rhf_seq; - else - pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = &pcl_q->hdr_qe.rx_hdrq_rhf_seq; - - if (get_psm_gen1_hi()->hfp_private.dma_rtail) - *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = 0; - else - *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = 1; - /* hw egr buff q: */ - pcl_q = &pc_private->cl_qs[PSM3_GEN1_CL_Q_RX_EGR_Q]; - pcl_q->cl_q_head = (volatile uint64_t *)&(uregbase[ur_rcvegrindexhead]); - pcl_q->cl_q_tail = (volatile uint64_t *)&(uregbase[ur_rcvegrindextail]); - pcl_q->egr_buffs = psm3_ips_recvq_egrbuf_table_alloc(ep, - (void*)(ctrl->base_info.rcvegr_bufbase), - ctrl->ctxt_info.egrtids, - ctrl->ctxt_info.rcvegr_size); - } - /* Next, initialize the subcontext's rx hdr q and egr buff q: */ - for (i=0; i < pc_private->user_info.subctxt_cnt;i++) - { - /* Subcontexts mimic the HW registers but use different addresses - * to avoid cache contention. 
*/ - volatile uint64_t *subcontext_uregbase; - uint32_t *rcv_hdr, *rcv_egr; - unsigned hdrsize, egrsize; - unsigned pagesize = getpagesize(); - uint32_t subcontext = i; - unsigned i = pagesize - 1; - hdrsize = - (ctrl->ctxt_info.rcvhdrq_cnt * ctrl->ctxt_info.rcvhdrq_entsize + i) & ~i; - egrsize = - (ctrl->ctxt_info.egrtids * ctrl->ctxt_info.rcvegr_size + i) & ~i; - - subcontext_uregbase = (uint64_t *) - (((uintptr_t) (ctrl->base_info.subctxt_uregbase)) + - (sizeof(struct gen1_ips_subcontext_ureg) * subcontext)); - { - struct gen1_ips_subcontext_ureg *pscureg = (struct gen1_ips_subcontext_ureg *)subcontext_uregbase; - - if (subcontext == ctrl->ctxt_info.subctxt) - { - memset(pscureg, 0, sizeof(*pscureg)); - if (get_psm_gen1_hi()->hfp_private.dma_rtail) - pscureg->writeq_state.hdrq_rhf_seq = 0; - else - pscureg->writeq_state.hdrq_rhf_seq = 1; - } - } - - rcv_hdr = (uint32_t *) - (((uintptr_t) (ctrl->base_info.subctxt_rcvhdrbuf)) + - (hdrsize * subcontext)); - rcv_egr = (uint32_t *) - (((uintptr_t) ctrl->base_info.subctxt_rcvegrbuf + - (egrsize * subcontext))); - - /* rx hdr q: */ - psm3_gen1_cl_q_t *pcl_q = &(pc_private->cl_qs[PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(subcontext)]); - pcl_q->hdr_qe.hdrq_base_addr = rcv_hdr; - pcl_q->cl_q_head = (volatile uint64_t *)&subcontext_uregbase[ur_rcvhdrhead * 8]; - pcl_q->cl_q_tail = (volatile uint64_t *)&subcontext_uregbase[ur_rcvhdrtail * 8]; - - /* egr q: */ - pcl_q = &(pc_private->cl_qs[PSM3_GEN1_GET_SC_CL_Q_RX_EGR_Q(subcontext)]); - pcl_q->cl_q_head = (volatile uint64_t *)&subcontext_uregbase[ur_rcvegrindexhead * 8]; - pcl_q->cl_q_tail = (volatile uint64_t *)&subcontext_uregbase[ur_rcvegrindextail * 8]; - pcl_q->egr_buffs = psm3_ips_recvq_egrbuf_table_alloc( - ep, - (void*)rcv_egr, - ctrl->ctxt_info.egrtids, - ctrl->ctxt_info.rcvegr_size); - } - - /* Construct epid for this Endpoint */ - ep->epid = psm_ctxt->epid = psm3_epid_pack_ips(lid, ctrl->ctxt_info.ctxt, - ctrl->ctxt_info.subctxt, ep->unit_id, - ep->addr); - - _HFI_VDBG("construct epid v%u: %s: lid %d ctxt %d subctxt %d hcatype %d addr %s mtu %d\n", - ep->addr.fmt, - psm3_epid_fmt_internal(ep->epid, 0), lid, - ctrl->ctxt_info.ctxt, ctrl->ctxt_info.subctxt, - PSMI_HFI_TYPE_OPA1, - psm3_naddr128_fmt(ep->addr, 1), ep->mtu); - } - ep->wiremode = 0; // Only 1 mode for OPA - ep->context.ep = ep; - return PSM_HAL_ERROR_OK; - - /* no failure possible after alloc egr_buffs */ - //psm3_gen1_free_egr_buffs(pc_private); -bail: - /* Unmap the HFI memory mapped by userinit_internal */ - psm3_gen1_unmap_hfi_mem(pc_private); -bail_fd: - if (fd >0) close(fd); - if (pc_private) { - if (pc_private->ctrl) free(pc_private->ctrl); - psmi_free(pc_private); - psm_ctxt->psm_hw_ctxt = NULL; - } - - return -PSM_HAL_ERROR_GENERAL_ERROR; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_port_index2pkey(psm2_ep_t ep, int index) -{ - return psm3_gen1_get_port_index2pkey(ep->unit_id, ep->portnum, index); -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_set_pkey(psmi_hal_hw_context ctxt, uint16_t pkey) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - return psm3_gen1_set_pkey(psm_hw_ctxt->ctrl, pkey); -} - -/* Tell the driver to change the way packets can generate interrupts. - - HFI1_POLL_TYPE_URGENT: Generate interrupt only when send with - IPS_SEND_FLAG_INTR (HFI_KPF_INTR) - HFI1_POLL_TYPE_ANYRCV: wakeup on any rcv packet (when polled on). 
[not used] - - PSM: Uses TYPE_URGENT in ips protocol -*/ - -static PSMI_HAL_INLINE int psm3_hfp_gen1_poll_type(uint16_t poll_type, psm2_ep_t ep) -{ - if (poll_type == PSMI_HAL_POLL_TYPE_URGENT) - poll_type = HFI1_POLL_TYPE_URGENT; - else - poll_type = 0; - hfp_gen1_pc_private *psm_hw_ctxt = ep->context.psm_hw_ctxt; - return psm3_gen1_poll_type(psm_hw_ctxt->ctrl, poll_type); -} - -// initialize HAL specific parts of ptl_ips -// This is called after most of the generic aspects have been initialized -// so we can use ptl->ep, ptl->ctl, etc as needed -// However it is called prior to ips_proto_init. ips_proto_init requires some -// ips_ptl items such as ptl->spioc -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_ptl_init_pre_proto_init(struct ptl_ips *ptl) -{ - return psm3_gen1_ips_ptl_init_pre_proto_init(ptl); -} - -// initialize HAL specific parts of ptl_ips -// This is called after after ips_proto_init and after most of the generic -// aspects of ips_ptl have been initialized -// so we can use ptl->ep and ptl->proto as needed -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_ptl_init_post_proto_init(struct ptl_ips *ptl) -{ - return psm3_gen1_ips_ptl_init_post_proto_init(ptl); -} - -// finalize HAL specific parts of ptl_ips -// This is called before the generic aspects have been finalized -// but after ips_proto has been finalized -// so we can use ptl->ep as needed -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_ptl_fini(struct ptl_ips *ptl) -{ - return psm3_gen1_ips_ptl_fini(ptl); -} - -// initialize HAL specific details in ips_proto. -// called after many of ips_proto parameters parsed and initialized -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_proto_init( - struct ips_proto *proto, uint32_t cksum_sz) -{ - psm2_error_t err = PSM2_OK; - hfp_gen1_pc_private *psm_hw_ctxt = proto->ep->context.psm_hw_ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - union psmi_envvar_val env_mtu; - - // defaults for SDMA thresholds. These may be updated when - // PSM3_* env for SDMA are parsed later in psm3_ips_proto_init. 
- if(psm3_cpu_model == CPUID_MODEL_PHI_GEN2 || psm3_cpu_model == CPUID_MODEL_PHI_GEN2M) - { - proto->iovec_thresh_eager = 65536; - proto->iovec_thresh_eager_blocking = 200000; - } else { - proto->iovec_thresh_eager = 16384; - proto->iovec_thresh_eager_blocking = 34000; - } - - // set basic HW info, some of which is used for dispersive routing hash - proto->epinfo.ep_baseqp = ctrl->base_info.bthqp; - proto->epinfo.ep_context = ctrl->ctxt_info.ctxt; /* "real" context */ - proto->epinfo.ep_hash = proto->epinfo.ep_context; - proto->epinfo.ep_subcontext = ctrl->ctxt_info.subctxt; - proto->epinfo.ep_hfi_type = PSMI_HFI_TYPE_OPA1; - proto->epinfo.ep_jkey = psm3_hfp_gen1_get_jkey(proto->ep); - - // at this point ep->mtu is our PSM payload HW capability found during - // open (not yet adjusted for optional cksum_sz) - - /* See if user specifies a lower MTU to use */ - if (!psm3_getenv("PSM3_MTU", - "Upper bound on packet MTU (<=0 uses port MTU): 1-7,256,512,1024,2048,4096,8192,10240]", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)-1, &env_mtu)) { - if (env_mtu.e_int >= OPA_MTU_MIN && env_mtu.e_int <= OPA_MTU_MAX) //enum - env_mtu.e_int = opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int); - else if (env_mtu.e_int < OPA_MTU_MIN) // pick default - env_mtu.e_int = 8192; - else // wash through enum to force round up to next valid MTU - env_mtu.e_int = opa_mtu_enum_to_int(opa_mtu_int_to_enum(env_mtu.e_int)); - if (proto->ep->mtu > env_mtu.e_int) - proto->ep->mtu = env_mtu.e_int; - } - /* allow space for optional software managed checksum (for debug) */ - proto->ep->mtu -= cksum_sz; - // ep->mtu is our final choice of local PSM payload we can support - proto->epinfo.ep_mtu = proto->ep->mtu; - -#ifdef PSM_BYTE_FLOW_CREDITS - // for OPA we let flow_credits be the control - proto->flow_credit_bytes = proto->ep->mtu * proto->flow_credits; -#endif - /* - * The PIO size should not include the ICRC because it is - * stripped by HW before delivering to receiving buffer. - * We decide to use minimum 2 PIO buffers so that PSM has - * turn-around time to do PIO transfer. Each credit is a - * block of 64 bytes. Also PIO buffer size must not be - * bigger than MTU. - */ - proto->epinfo.ep_piosize = psmi_hal_get_pio_size(psm_hw_ctxt) - cksum_sz; - proto->epinfo.ep_piosize = - min(proto->epinfo.ep_piosize, proto->epinfo.ep_mtu); - - /* Keep PIO as multiple of cache line size */ - if (proto->epinfo.ep_piosize > PSM_CACHE_LINE_BYTES) - proto->epinfo.ep_piosize &= ~(PSM_CACHE_LINE_BYTES - 1); - - /* Save back to hfi level. 
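For reference, a compact sketch of the PIO sizing just above: start from the hardware PIO buffer size minus any software checksum, cap it at the MTU, then trim down to whole cache lines. The 64-byte line size is an assumption standing in for PSM_CACHE_LINE_BYTES.

    #include <stdint.h>

    #define CACHE_LINE_BYTES 64u    /* assumed stand-in for PSM_CACHE_LINE_BYTES */

    static uint32_t final_piosize(uint32_t hw_pio_size, uint32_t cksum_sz, uint32_t mtu)
    {
        uint32_t piosize = hw_pio_size - cksum_sz;

        if (piosize > mtu)
            piosize = mtu;                          /* never exceed the payload MTU */
        if (piosize > CACHE_LINE_BYTES)
            piosize &= ~(CACHE_LINE_BYTES - 1u);    /* keep a whole number of cache lines */
        return piosize;
    }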
*/ - ctrl->__hfi_mtusize = proto->epinfo.ep_mtu; - ctrl->__hfi_piosize = proto->epinfo.ep_piosize; - - /* sdma queue size */ - proto->sdma_queue_size = psm3_gen1_get_sdma_ring_size(psm_hw_ctxt); - /* don't use the last slot */ - if (proto->sdma_queue_size > 8) { - /* configure sdma_avail_counter */ - union psmi_envvar_val env_sdma_avail; - int tmp_queue_size = 8; - - psm3_getenv("PSM3_MAX_PENDING_SDMA_REQS", - "PSM maximum pending SDMA requests", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val) tmp_queue_size, - &env_sdma_avail); - - if ((env_sdma_avail.e_int < 8) || (env_sdma_avail.e_int > (proto->sdma_queue_size - 1))) - proto->sdma_avail_counter = 8; - else - proto->sdma_avail_counter = env_sdma_avail.e_int; - } else { - err = PSM2_PARAM_ERR; - goto fail; - } - - - proto->sdma_fill_index = 0; - proto->sdma_done_index = 0; - proto->sdma_scb_queue = (struct ips_scb **) - psmi_calloc(proto->ep, UNDEFINED, - proto->sdma_queue_size, sizeof(struct ips_scb *)); - if (proto->sdma_scb_queue == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - - /* - * Pre-calculate the PSN mask to support 24 or 31 bit PSN. - */ - if (psmi_hal_has_cap(PSM_HAL_CAP_EXTENDED_PSN)) { - proto->psn_mask = 0x7FFFFFFF; - } else { - proto->psn_mask = 0xFFFFFF; - } - /* 12 bit pktlen (limit to <= 4095 32 bit words per packet */ - proto->pktlen_mask = 0xFFF; -fail: - return err; -} - -// Fetch current link state to update linkinfo fields in ips_proto: -// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables -// These are all fields which can change during a link bounce. -// Note "active" state is not adjusted as on link down PSM will wait for -// the link to become usable again so it's always a viable/active device -// afer initial PSM startup has selected devices. -// Called during initialization of ips_proto during ibta_init as well -// as during a link bounce. -// TBD - may be able to call this from HAL ips_proto_init as well as -// directly within HAL event processing, in which case this could -// be completely internal to HAL and not exposed in HAL API -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_proto_update_linkinfo( - struct ips_proto *proto) -{ - return psm3_gen1_ptl_ips_update_linkinfo(proto); -} - -// Indicate if all underlying connections are now established -// (eg. RV connections) -// return: -// 0 - not yet connected -// 1 - connected (or nothing extra needed) -// -1 - failure to check or connect (errno is status) -// EIO is connection error other values are more serious -// (invalid call, etc) -static PSMI_HAL_INLINE int psm3_hfp_gen1_ips_fully_connected(ips_epaddr_t *ipsaddr) -{ - return 1; -} - -/* handle HAL specific connection processing as part of processing an - * inbound PSM connect Request or Reply when connection not yet established - * save the negotiated parameters - */ -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_ipsaddr_set_req_params( - struct ips_proto *proto, - ips_epaddr_t *ipsaddr, - const struct ips_connect_reqrep *req) -{ - return PSM2_OK; -} - -/* handle HAL specific connection processing as part of processing an - * inbound PSM connect Reply which completes establishment of on outgoing - * connection. 
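For reference, a minimal sketch of arithmetic under the PSN mask chosen above (0xFFFFFF for 24-bit PSNs, 0x7FFFFFFF when the extended-PSN capability is present). psn_delta() is an illustrative helper, not a PSM3 API.

    #include <stdint.h>

    static inline uint32_t psn_delta(uint32_t newer, uint32_t older, uint32_t psn_mask)
    {
        /* distance from 'older' to 'newer', modulo the PSN space */
        return (newer - older) & psn_mask;
    }

    /* e.g. with the 24-bit mask, psn_delta(0x000001, 0xFFFFFF, 0xFFFFFF) == 2:
     * the counter wrapped, yet the two packets are only two slots apart */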
- */ -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_ipsaddr_process_connect_reply( - struct ips_proto *proto, - ips_epaddr_t *ipsaddr, - const struct ips_connect_reqrep *req) -{ - return PSM2_OK; -} - -/* build HAL specific portion of an outbound PSM connect message - * for PSM Connect or Disconnect Request or Reply - */ -static PSMI_HAL_INLINE void psm3_hfp_gen1_ips_proto_build_connect_message( - struct ips_proto *proto, - ips_epaddr_t *ipsaddr, uint8_t opcode, - struct ips_connect_reqrep *req) -{ - switch (opcode) { - case OPCODE_CONNECT_REPLY: - case OPCODE_CONNECT_REQUEST: - memset(req->hal_pad, 0, sizeof(req->hal_pad)); - break; - case OPCODE_DISCONNECT_REQUEST: - case OPCODE_DISCONNECT_REPLY: - // placeholder, but typically nothing to be done - // as the ips_connect_hdr is sufficient - break; - default: - psmi_assert_always(0); - break; - } -} - -/* handle HAL specific ipsaddr initialization for addressing, including - * parts of ipsaddr needed for path record query - * For ipsaddr created just for a disconnect, ips_ipsaddr_init_connections - * is not called. In which case ips_ipsaddr_init_addressing and ips_flow_init - * need to do what is needed to allow spio_transfer_frame to send the - * disconnect control packet. - */ -static PSMI_HAL_INLINE void psm3_hfp_gen1_ips_ipsaddr_init_addressing( - struct ips_proto *proto, psm2_epid_t epid, - ips_epaddr_t *ipsaddr, uint16_t *lidp - ) -{ - /* Actual context of peer */ - ipsaddr->opa.context = psm3_epid_context(epid); - /* Subcontext */ - ipsaddr->opa.subcontext = psm3_epid_subcontext(epid); - ipsaddr->hash = ipsaddr->opa.context; - - // for OPA, just need lid - *lidp = psm3_epid_lid(epid); -} - -/* handle HAL specific ipsaddr initialization for any HAL specific connections - * underlying the ipsaddr (RC QPs, TCP sockets, etc) - * This is not called for an ipsaddr created just for a disconnect. In which - * case ips_ipsaddr_init_addressing and ips_flow_init need to do what is - * needed to allow spio_transfer_frame to send the disconnect control packet. 
- */ -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_ipsaddr_init_connections( - struct ips_proto *proto, psm2_epid_t epid, - ips_epaddr_t *ipsaddr) -{ - return PSM2_OK; -} - -/* handle HAL specific ipsaddr free for any HAL specific information - * in ipsaddr (from ipsaddr_init_*, set_req_params, etc - */ -static PSMI_HAL_INLINE void psm3_hfp_gen1_ips_ipsaddr_free( - ips_epaddr_t *ipsaddr, struct ips_proto *proto) -{ -} - -/* handle HAL specific ips_flow initialization - */ -static PSMI_HAL_INLINE void psm3_hfp_gen1_ips_flow_init( - struct ips_flow *flow, struct ips_proto *proto) -{ - if (flow->transfer == PSM_TRANSFER_PIO) { - flow->flush = psm3_ips_proto_flow_flush_pio; - } else { - flow->flush = ips_proto_flow_flush_dma; - } - - /* if PIO, need to consider local pio buffer size */ - if (flow->transfer == PSM_TRANSFER_PIO) { - flow->frag_size = min(flow->frag_size, proto->epinfo.ep_piosize); - _HFI_CONNDBG("[ipsaddr=%p] PIO flow->frag_size: %u = min(" - "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u), proto->epinfo.ep_piosize(%u))\n", - flow->ipsaddr, flow->frag_size, proto->epinfo.ep_mtu, - flow->path->pr_mtu, proto->epinfo.ep_piosize); - } else { - _HFI_CONNDBG("[ipsaddr=%p] SDMA flow->frag_size: %u = min(" - "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u))\n", - flow->ipsaddr, flow->frag_size, proto->epinfo.ep_mtu, - flow->path->pr_mtu); - } - - flow->cca_ooo_pkts = 0; -} - -/* handle HAL specific connection processing as part of processing an - * outbound PSM disconnect Request or Reply or an inbound disconnect request - */ -static PSMI_HAL_INLINE void psm3_hfp_gen1_ips_ipsaddr_disconnect( - struct ips_proto *proto, ips_epaddr_t *ipsaddr) -{ -} - -/* Handle HAL specific initialization of ibta path record query, CCA - * and dispersive routing - */ -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_ibta_init( - struct ips_proto *proto) -{ - psm2_error_t err = PSM2_OK; - union psmi_envvar_val psm_path_policy; - union psmi_envvar_val disable_cca; - union psmi_envvar_val cca_prescan; - - /* Get the path selection policy */ - psm3_getenv("PSM3_PATH_SELECTION", - "Policy to use if multiple paths are available between endpoints. Options are adaptive, static_src, static_dest, static_base. 
Default is adaptive.", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"adaptive", &psm_path_policy); - - if (!strcasecmp((const char *)psm_path_policy.e_str, "adaptive")) - proto->flags |= IPS_PROTO_FLAG_PPOLICY_ADAPTIVE; - else if (!strcasecmp((const char *)psm_path_policy.e_str, "static_src")) - proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_SRC; - else if (!strcasecmp - ((const char *)psm_path_policy.e_str, "static_dest")) - proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_DST; - else if (!strcasecmp - ((const char *)psm_path_policy.e_str, "static_base")) - proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_BASE; - - if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) - _HFI_PRDBG("Using adaptive path selection.\n"); - if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) - _HFI_PRDBG("Static path selection: Src Context\n"); - if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) - _HFI_PRDBG("Static path selection: Dest Context\n"); - if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE) - _HFI_PRDBG("Static path selection: Base LID\n"); - - psm3_getenv("PSM3_DISABLE_CCA", - "Disable use of Congestion Control Architecture (CCA) [enabled] ", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)0, &disable_cca); - if (disable_cca.e_uint) - _HFI_CCADBG("CCA is disabled for congestion control.\n"); - else { - int i; - char ccabuf[256]; - uint8_t *p; - - /* Start out by turning on both styles of congestion control. - * Later, we will eliminate the correct one. */ - proto->flags |= IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CC_REPL_BECN; -/* - * If user set any environment variable, use self CCA. - */ - if (getenv("PSM3_CCTI_INCREMENT") || getenv("PSM3_CCTI_TIMER") - || getenv("PSM3_CCTI_TABLE_SIZE")) { - goto disablecca; - } - - psm3_getenv("PSM3_CCA_PRESCAN", - "Enable Congestion Control Prescanning (disabled by default) ", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)0, &cca_prescan); - - if (cca_prescan.e_uint) - proto->flags |= IPS_PROTO_FLAG_CCA_PRESCAN; - -/* - * Check qib driver CCA setting, and try to use it if available. - * Fall to self CCA setting if errors. - */ - i = psm3_gen1_get_cc_settings_bin(proto->ep->unit_id, - proto->ep->portnum, ccabuf, sizeof(ccabuf)); - - if (i <= 0) { - goto disablecca; - } - p = (uint8_t *) ccabuf; - memcpy(&proto->ccti_ctrlmap, p, 4); - p += 4; - memcpy(&proto->ccti_portctrl, p, 2); - p += 2; - for (i = 0; i < 32; i++) { - proto->cace[i].ccti_increase = *p; - p++; - /* skip reserved u8 */ - p++; - memcpy(&proto->cace[i].ccti_timer_cycles, p, 2); - p += 2; - proto->cace[i].ccti_timer_cycles = - us_2_cycles(proto->cace[i].ccti_timer_cycles); - proto->cace[i].ccti_threshold = *p; - p++; - proto->cace[i].ccti_min = *p; - p++; - } - - i = psm3_gen1_get_cc_table_bin(proto->ep->unit_id, proto->ep->portnum, - &proto->cct); - if (i < 0) { - err = PSM2_NO_MEMORY; - goto fail; - } else if (i == 0) { - goto disablecca; - } - proto->ccti_limit = i; - proto->ccti_size = proto->ccti_limit + 1; - - _HFI_CCADBG("ccti_limit = %d\n", (int) proto->ccti_limit); - for (i = 0; i < proto->ccti_limit; i++) - _HFI_CCADBG("cct[%d] = 0x%04x\n", i, (int) proto->cct[i]); - - /* Note, here, we are leaving CC style(s): - (IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CCA_PRESCAN) */ - proto->flags &= ~IPS_PROTO_FLAG_CC_REPL_BECN; - goto finishcca; - -/* - * Disable CCA. 
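For reference, a sketch of the binary layout being unpacked above from psm3_gen1_get_cc_settings_bin(): a 4-byte CCTI control map, a 2-byte port control word, then 32 per-SL entries of {increase u8, reserved u8, timer u16, threshold u8, min u8}, 6 bytes each. The struct and function names below are illustrative only.

    #include <stdint.h>
    #include <string.h>

    struct cca_sl_entry {
        uint8_t  ccti_increase;
        uint16_t ccti_timer;        /* later converted to cycles via us_2_cycles() */
        uint8_t  ccti_threshold;
        uint8_t  ccti_min;
    };

    static void parse_cc_settings(const uint8_t *p, uint32_t *ctrlmap,
                                  uint16_t *portctrl, struct cca_sl_entry e[32])
    {
        int i;

        memcpy(ctrlmap, p, 4);  p += 4;
        memcpy(portctrl, p, 2); p += 2;
        for (i = 0; i < 32; i++) {
            e[i].ccti_increase = *p++;
            p++;                                /* reserved byte */
            memcpy(&e[i].ccti_timer, p, 2);     p += 2;
            e[i].ccti_threshold = *p++;
            e[i].ccti_min = *p++;
        }
    }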
- */ -disablecca: - /* Note, here, we are leaving CC style: - IPS_PROTO_FLAG_CC_REPL_BECN */ - proto->flags &= ~(IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CCA_PRESCAN); - } - -finishcca: -fail: - return err; - -} - -/* Handle HAL specific initialization of an ips_path_rec - * as part of fetching or hand building a path record. - * Responsible for all fields in the HAL specific union and any tweaks to - * other fields which may be HAL specific (such as pr_mtu). - * response is only provided when we are building a ips_path_rec from a - * fetched ibta_path_rec. Otherwise we are building it solely based on - * our own end point and what our caller knows from the EPID. - */ -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_path_rec_init( - struct ips_proto *proto, - struct ips_path_rec *path_rec, - struct _ibta_path_rec *response) -{ - psm2_error_t err = PSM2_OK; - /* Setup CCA parameters for path */ - if (!(proto->ccti_ctrlmap & (1 << path_rec->pr_sl))) { - _HFI_CCADBG("No CCA for sl %d, disable CCA\n", - path_rec->pr_sl); - proto->flags &= ~IPS_PROTO_FLAG_CCA; - proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN; - } - if (!psmi_hal_has_cap(PSM_HAL_CAP_STATIC_RATE_CTRL)) { - _HFI_CCADBG("No Static-Rate-Control, disable CCA\n"); - proto->flags &= ~IPS_PROTO_FLAG_CCA; - proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN; - } - - path_rec->opa.pr_proto = proto; - path_rec->opa.pr_ccti = proto->cace[path_rec->pr_sl].ccti_min; - path_rec->opa.pr_timer_cca = NULL; - - /* Determine active IPD for path. Is max of static rate and CCT table */ - if (!(proto->flags & IPS_PROTO_FLAG_CCA)) { - _HFI_CCADBG("No IPS_PROTO_FLAG_CCA\n"); - - path_rec->opa.pr_active_ipd = 0; - path_rec->opa.pr_cca_divisor = 0; - - _HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->opa.pr_active_ipd); - _HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->opa.pr_cca_divisor); - } else if ((path_rec->pr_static_ipd) && - ((path_rec->pr_static_ipd + 1) > - (proto->cct[path_rec->opa.pr_ccti] & CCA_IPD_MASK))) { - _HFI_CCADBG("IPS_PROTO_FLAG_CCA set, Setting pr_active_ipd.\n"); - - path_rec->opa.pr_active_ipd = path_rec->pr_static_ipd + 1; - path_rec->opa.pr_cca_divisor = 0; - - _HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->opa.pr_active_ipd); - _HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->opa.pr_cca_divisor); - } else { - /* Pick it from the CCT table */ - _HFI_CCADBG("Picking up active IPD from CCT table, index %d, value 0x%x\n", - (int) path_rec->opa.pr_ccti, (int) proto->cct[path_rec->opa.pr_ccti]); - - path_rec->opa.pr_active_ipd = - proto->cct[path_rec->opa.pr_ccti] & CCA_IPD_MASK; - path_rec->opa.pr_cca_divisor = - proto->cct[path_rec->opa.pr_ccti] >> CCA_DIVISOR_SHIFT; - - _HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->opa.pr_active_ipd); - _HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->opa.pr_cca_divisor); - } - return err; -} - -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_ips_ptl_pollintr( - psm2_ep_t ep, struct ips_recvhdrq *recvq, - int fd_pipe, int next_timeout, - uint64_t *pollok, uint64_t *pollcyc) -{ - return psm3_gen1_ips_ptl_pollintr(ep, recvq, fd_pipe, - next_timeout, pollok, pollcyc); -} - -#ifdef PSM_CUDA -static PSMI_HAL_INLINE void psm3_hfp_gen1_gdr_close(void) -{ - psm3_gen1_gdr_close(); -} -static PSMI_HAL_INLINE void* psm3_hfp_gen1_gdr_convert_gpu_to_host_addr(unsigned long buf, - size_t size, int flags, psm2_ep_t ep) -{ - return psm3_gen1_gdr_convert_gpu_to_host_addr(buf, size, flags, ep); -} -#endif /* PSM_CUDA */ - -static PSMI_HAL_INLINE int psm3_hfp_gen1_free_tid(psmi_hal_hw_context 
ctxt, uint64_t tidlist, uint32_t tidcnt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - return psm3_gen1_free_tid(psm_hw_ctxt->ctrl, tidlist, tidcnt); -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_tidcache_invalidation(psmi_hal_hw_context ctxt, uint64_t tidlist, uint32_t *tidcnt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - return psm3_gen1_get_invalidation(psm_hw_ctxt->ctrl, tidlist, tidcnt); -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_update_tid(psmi_hal_hw_context ctxt, uint64_t vaddr, uint32_t *length, - uint64_t tidlist, uint32_t *tidcnt, uint16_t flags) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - - return psm3_gen1_update_tid(psm_hw_ctxt->ctrl, vaddr, length, tidlist, tidcnt, flags); -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_hfi_event_bits(uint64_t *event_bits, psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - uint64_t *pevents_mask = (uint64_t *)ctrl->base_info.events_bufbase; - uint64_t events_mask = *pevents_mask; - uint64_t hal_hfi_event_bits = 0; - int i; - - if (!events_mask) - { - *event_bits = 0; - return PSM_HAL_ERROR_OK; - } - - /* Encode hfi1_events as HAL event codes here */ - for (i = 0; i < sizeof(hfi1_events_map)/sizeof(hfi1_events_map[0]); i++) - { - if (events_mask & hfi1_events_map[i].hfi1_event_bit) - hal_hfi_event_bits |= - hfi1_events_map[i].psmi_hal_hfi_event_bit; - } - - *event_bits = hal_hfi_event_bits; - - return PSM_HAL_ERROR_OK; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_tidflow_set_entry(uint32_t flowid, uint32_t genval, uint32_t seqnum, psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - psm3_gen1_tidflow_set_entry(ctrl, flowid, genval, seqnum); - return PSM_HAL_ERROR_OK; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_tidflow_reset(psmi_hal_hw_context ctxt, uint32_t flowid, uint32_t genval, uint32_t seqnum) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - psm3_gen1_tidflow_reset(ctrl, flowid, genval, seqnum); - return PSM_HAL_ERROR_OK; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_tidflow_get(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - *ptf = psm3_gen1_tidflow_get(ctrl, flowid); - return PSM_HAL_ERROR_OK; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_tidflow_get_hw(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context ctxt) -{ - return psm3_hfp_gen1_tidflow_get(flowid, ptf, ctxt); -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_tidflow_get_seqnum(uint64_t val, uint32_t *pseqn) -{ - *pseqn = psm3_gen1_tidflow_get_seqnum(val); - return PSM_HAL_ERROR_OK; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_tidflow_check_update_pkt_seq(void *vpprotoexp - /* actually a: - struct ips_protoexp *protoexp */, - psmi_seqnum_t sequence_num, - void *vptidrecvc - /* actually a: - struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr, - void (*ips_protoexp_do_tf_generr) - (void *vpprotoexp - /* actually a: - struct ips_protoexp *protoexp */, - void *vptidrecvc - /* actually a: - struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr), - void (*ips_protoexp_do_tf_seqerr) - (void *vpprotoexp - /* actually a: - struct ips_protoexp *protoexp */, - void *vptidrecvc - /* actually a: - struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr) - ) -{ - struct ips_protoexp *protoexp = (struct ips_protoexp *) 
vpprotoexp; - struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) vptidrecvc; - - if_pf(psmi_hal_has_sw_status(PSM_HAL_HDRSUPP_ENABLED)) { - /* Drop packet if generation number does not match. There - * is a window that before we program the hardware tidflow - * table with new gen/seq, hardware might receive some - * packets with the old generation. - */ - if (sequence_num.psn_gen != tidrecvc->tidflow_genseq.psn_gen) - { - PSM2_LOG_MSG("leaving"); - return PSM_HAL_ERROR_GENERAL_ERROR; - } - -#ifdef PSM_DEBUG - /* Check if new packet falls into expected seq range, we need - * to deal with wrap around of the seq value from 2047 to 0 - * because seq is only 11 bits. */ - int16_t seq_off = (int16_t)(sequence_num.psn_seq - - tidrecvc->tidflow_genseq.psn_seq); - if (seq_off < 0) - seq_off += 2048; /* seq is 11 bits */ - psmi_assert(seq_off < 1024); -#endif - /* NOTE: with RSM in use, we should not automatically update - * our PSN from the HFI's PSN. The HFI doesn't know about - * RSM interceptions. - */ - /* (DON'T!) Update the shadow tidflow_genseq */ - /* tidrecvc->tidflow_genseq.psn_seq = sequence_num.psn_seq + 1; */ - - } - /* Always check the sequence number if we get a header, even if SH. */ - if_pt(sequence_num.psn_num == tidrecvc->tidflow_genseq.psn_num) { - /* Update the shadow tidflow_genseq */ - tidrecvc->tidflow_genseq.psn_seq = sequence_num.psn_seq + 1; - - /* update the fake tidflow table with new seq, this is for - * seqerr and err_chk_gen processing to get the latest - * valid sequence number */ - psm3_hfp_gen1_tidflow_set_entry( - tidrecvc->rdescid._desc_idx, - tidrecvc->tidflow_genseq.psn_gen, - tidrecvc->tidflow_genseq.psn_seq, - tidrecvc->context->psm_hw_ctxt); - } else { - /* Generation mismatch */ - if (sequence_num.psn_gen != tidrecvc->tidflow_genseq.psn_gen) { - ips_protoexp_do_tf_generr(protoexp, - tidrecvc, p_hdr); - PSM2_LOG_MSG("leaving"); - return PSM_HAL_ERROR_GENERAL_ERROR; - } else { - /* Possible sequence mismatch error */ - /* First, check if this is a recoverable SeqErr - - * caused by a good packet arriving in a tidflow that - * has had a FECN bit set on some earlier packet. - */ - - /* If this is the first RSM packet, our own PSN state - * is probably old. Pull from the HFI if it has - * newer data. - */ - uint64_t tf; - psmi_seqnum_t tf_sequence_num; - - psm3_hfp_gen1_tidflow_get(tidrecvc->rdescid._desc_idx, &tf, - tidrecvc->context->psm_hw_ctxt); - psm3_hfp_gen1_tidflow_get_seqnum(tf, &tf_sequence_num.psn_val); - - if (tf_sequence_num.psn_val > tidrecvc->tidflow_genseq.psn_seq) - tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq; - - /* Now re-check the sequence numbers. */ - if (sequence_num.psn_seq > tidrecvc->tidflow_genseq.psn_seq) { - /* It really was a sequence error. Restart. */ - ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr); - PSM2_LOG_MSG("leaving"); - return PSM_HAL_ERROR_GENERAL_ERROR; - } else { - /* False SeqErr. We can accept this packet. 
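For reference, the wraparound arithmetic used in the sequence test above, kept as a standalone sketch: tidflow sequence numbers are 11 bits wide, so the received-minus-expected distance is folded back into 0..2047 before the in-window comparison. tf_seq_offset() is an illustrative name.

    #include <stdint.h>

    static int16_t tf_seq_offset(uint16_t received, uint16_t expected)
    {
        int16_t off = (int16_t)(received - expected);

        if (off < 0)
            off += 2048;        /* seq is 11 bits */
        return off;
    }

    /* e.g. tf_seq_offset(3, 2045) == 6: the raw value wrapped from 2047 to 0,
     * but the packet is still only 6 slots ahead of the expected sequence */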
*/ - if (sequence_num.psn_seq == tidrecvc->tidflow_genseq.psn_seq) - tidrecvc->tidflow_genseq.psn_seq++; - } - } - } - - return PSM_HAL_ERROR_OK; -} - -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_spio_transfer_frame(struct ips_proto *proto, - struct ips_flow *flow, struct ips_scb *scb, - uint32_t *payload, uint32_t length, - uint32_t isCtrlMsg, uint32_t cksum_valid, - uint32_t cksum -#ifdef PSM_CUDA - , uint32_t is_cuda_payload -#endif - ) -{ - return psm3_gen1_spio_transfer_frame(proto, flow, scb, - payload, length, isCtrlMsg, - cksum_valid, cksum -#ifdef PSM_CUDA - , is_cuda_payload -#endif - ); -} - -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_transfer_frame(struct ips_proto *proto, - struct ips_flow *flow, struct ips_scb *scb, - uint32_t *payload, uint32_t length, - uint32_t isCtrlMsg, uint32_t cksum_valid, - uint32_t cksum -#ifdef PSM_CUDA - , uint32_t is_cuda_payload -#endif - ) -{ - switch (flow->transfer) { - case PSM_TRANSFER_PIO: - return psm3_gen1_spio_transfer_frame(proto, flow, scb, - payload, length, isCtrlMsg, - cksum_valid, cksum -#ifdef PSM_CUDA - , is_cuda_payload -#endif - ); - break; - case PSM_TRANSFER_DMA: - return psm3_gen1_dma_transfer_frame(proto, flow, scb, - payload, length, cksum_valid, cksum); - break; - default: - return PSM2_INTERNAL_ERR; - break; - } -} - -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_dma_send_pending_scbs(struct ips_proto *proto, - struct ips_flow *flow, struct ips_scb_pendlist *slist, - int *num_sent) -{ - return psm3_gen1_dma_send_pending_scbs(proto, flow, slist, num_sent); -} - -static PSMI_HAL_INLINE psm2_error_t psm3_hfp_gen1_drain_sdma_completions(struct ips_proto *proto) -{ - return psm3_gen1_dma_completion_update(proto); -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_node_id(int unit, int *nodep) -{ - int64_t node_id = psm3_sysfs_unit_read_node_s64(unit); - *nodep = (int)node_id; - if (node_id != -1) - return PSM_HAL_ERROR_OK; - else - return -PSM_HAL_ERROR_GENERAL_ERROR; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_jkey(psm2_ep_t ep) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ep->context.psm_hw_ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - return ctrl->base_info.jkey; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_pio_size(psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - return (ctrl->ctxt_info.credits / 2) * 64 - - (sizeof(struct ips_message_header) + HFI_PCB_SIZE_IN_BYTES); -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_subctxt(psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - return ctrl->ctxt_info.subctxt; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_subctxt_cnt(psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - - return psm_hw_ctxt->user_info.subctxt_cnt; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_tid_exp_cnt(psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - return ctrl->__hfi_tidexpcnt; -} - -static PSMI_HAL_INLINE int psm3_hfp_gen1_get_pio_stall_cnt(psmi_hal_hw_context ctxt, uint64_t **pio_stall_cnt) -{ - if (!ctxt) - return -PSM_HAL_ERROR_GENERAL_ERROR; - - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - - *pio_stall_cnt = &psm_hw_ctxt->spio_ctrl.spio_num_stall_total; - - return PSM_HAL_ERROR_OK; -} -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_hfi1_deprecated.h b/psm3/hal_gen1/gen1_hfi1_deprecated.h deleted file mode 100644 index 
6f62324..0000000 --- a/psm3/hal_gen1/gen1_hfi1_deprecated.h +++ /dev/null @@ -1,183 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2016 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2016 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* - - hfi1_deprecated_gen1.h - - Contains certain features of the hfi1 module that have been deprecated. - - These features may still need to be supported by the psm library for - reasons of backwards compatibility. - */ - -#ifndef __PSM_HAL_GEN1_HFI1_DEPRECATED_H__ - -#define __PSM_HAL_GEN1_HFI1_DEPRECATED_H__ - -/* First, include the current hfi1_user.h file: */ - -#include - -/* Determine if we need to define and declare deprecated - entities based on the IB_IOCTL_MAGIC macro. */ - -#if defined( IB_IOCTL_MAGIC ) - -/* The macro: PSM2_SUPPORT_IW_CMD_API is used to stipulate - adding compile-time support of either the ioctl() or write() - command interfaces to the driver. Note though that the - final decision whether to support this depends on factors - only known at runtime. */ -#define PSM2_SUPPORT_IW_CMD_API 1 -/* IOCTL_CMD_API_MODULE_MAJOR defines the first version of the hfi1 - * module that supports the ioctl() command interface. Prior to this - * (IOCTL_CMD_API_MODULE_MAJOR - 1 and smaller), the module used - * write() for the command interface. */ -#define IOCTL_CMD_API_MODULE_MAJOR 6 - -/* - * round robin contexts across HFIs, then - * ports; this is the default. 
- * This option spreads the HFI selection within the local socket. - * If it is preferred to spread job over over entire set of - * HFIs within the system, see ALG_ACROSS_ALL below. - */ -#define HFI1_ALG_ACROSS_DEP 0 - -/* - * use all contexts on an HFI (round robin - * active ports within), then next HFI - */ -#define HFI1_ALG_WITHIN_DEP 1 - -struct hfi1_cmd_deprecated { - __u32 type; /* command type */ - __u32 len; /* length of struct pointed to by add */ - __u64 addr; /* pointer to user structure */ -}; - -#define hfi1_cmd hfi1_cmd_deprecated - -#define HFI1_ALG_ACROSS HFI1_ALG_ACROSS_DEP -#define HFI1_ALG_WITHIN HFI1_ALG_WITHIN_DEP - -#else - -#define HFI1_SWMAJOR_SHIFT 16 - -#endif /* defined( IB_IOCTL_MAGIC )*/ - -#define HFI1_ALG_ACROSS_ALL_DEP 2 -#define HFI1_ALG_ACROSS_ALL HFI1_ALG_ACROSS_ALL_DEP - -/* Note that struct hfi1_user_info_dep declaration is identical to - the struct hfi1_user_info declaration from MAJOR version 5 of the - hfi1_user.h file. */ -struct hfi1_user_info_dep { - /* - * version of user software, to detect compatibility issues. - * Should be set to HFI1_USER_SWVERSION. - */ - __u32 userversion; - __u16 pad; - /* HFI selection algorithm, if unit has not selected */ - __u16 hfi1_alg; - /* - * If two or more processes wish to share a context, each process - * must set the subcontext_cnt and subcontext_id to the same - * values. The only restriction on the subcontext_id is that - * it be unique for a given node. - */ - __u16 subctxt_cnt; - __u16 subctxt_id; - /* 128bit UUID passed in by PSM. */ - __u8 uuid[16]; -}; - -/* - * We assume here that we have the hfi1_user.h file installed in the system path - * with the 'flags' field defined in struct sdma_req_info. (At least, when the - * user needs to run GPU workloads, this _should_ be the version of hfi1_user.h - * file installed by the IFS.) - */ -struct sdma_req_info_v6_3 { - /* - * bits 0-3 - version (currently unused) - * bits 4-7 - opcode (enum sdma_req_opcode) - * bits 8-15 - io vector count - */ - __u16 ctrl; - /* - * Number of fragments contained in this request. - * User-space has already computed how many - * fragment-sized packet the user buffer will be - * split into. - */ - __u16 npkts; - /* - * Size of each fragment the user buffer will be - * split into. - */ - __u16 fragsize; - /* - * Index of the slot in the SDMA completion ring - * this request should be using. User-space is - * in charge of managing its own ring. - */ - __u16 comp_idx; -} __attribute__((packed)); - -#endif /* #ifndef __PSM_HAL_GEN1_HFI1_DEPRECATED_H__ */ -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_i2cflash.c b/psm3/hal_gen1/gen1_i2cflash.c deleted file mode 100644 index ddc2420..0000000 --- a/psm3/hal_gen1/gen1_i2cflash.c +++ /dev/null @@ -1,89 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gen1_user.h" - -uint8_t psm3_gen1_hfi_flash_csum(struct hfi_flash *ifp, int adjust) -{ - uint8_t *ip = (uint8_t *) ifp; - uint8_t csum = 0, len; - - /* - * Limit length checksummed to max length of actual data. - * Checksum of erased eeprom will still be bad, but we avoid - * reading past the end of the buffer we were passed. - */ - len = ifp->if_length; - if (len > sizeof(struct hfi_flash)) - len = sizeof(struct hfi_flash); - while (len--) - csum += *ip++; - csum -= ifp->if_csum; - csum = ~csum; - if (adjust) - ifp->if_csum = csum; - return csum; -} -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_proto.c b/psm3/hal_gen1/gen1_proto.c deleted file mode 100644 index dff386a..0000000 --- a/psm3/hal_gen1/gen1_proto.c +++ /dev/null @@ -1,540 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* This file contains the initialization functions used by the low - level hfi protocol code. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gen1_user.h" -#include "utils_debug.h" - -#include - -size_t psm3_gen1_arrsz[MAPSIZE_MAX] = { 0 }; - -static int psm3_gen1_map_hfi_mem(int fd, struct _hfi_ctrl *ctrl, size_t subctxt_cnt) -{ -#define CREDITS_NUM 64 - struct hfi1_ctxt_info *cinfo = &ctrl->ctxt_info; - struct hfi1_base_info *binfo = &ctrl->base_info; - size_t sz; - __u64 off; - void *maddr; - - /* 1. Map the PIO credits address */ - off = binfo->sc_credits_addr &~ HFI_MMAP_PGMASK; - - sz = HFI_MMAP_PGSIZE; - maddr = HFI_MMAP_ERRCHECK(fd, binfo, sc_credits_addr, sz, PROT_READ); - psm3_gen1_touch_mmap(maddr, sz); - psm3_gen1_arrsz[SC_CREDITS] = sz; - - binfo->sc_credits_addr |= off; - - /* 2. Map the PIO buffer SOP address - * Skipping the cast of cinfo->credits to size_t. This causes the outcome of the multiplication - * to be sign-extended in the event of too large input values. This results in a very large product - * when treated as unsigned which in turn will make the HFI_MMAP_ERRCHECK() macro fail and give an - * adequate error report. TODO: Consider sanitizing the credits value explicitly - */ - sz = cinfo->credits * CREDITS_NUM; - HFI_MMAP_ERRCHECK(fd, binfo, pio_bufbase_sop, sz, PROT_WRITE); - psm3_gen1_arrsz[PIO_BUFBASE_SOP] = sz; - - /* 3. Map the PIO buffer address */ - sz = cinfo->credits * CREDITS_NUM; - HFI_MMAP_ERRCHECK(fd, binfo, pio_bufbase, sz, PROT_WRITE); - psm3_gen1_arrsz[PIO_BUFBASE] = sz; - - /* 4. Map the receive header queue - * (u16 * u16 -> max value 0xfffe0001) - */ - sz = (size_t)cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize; - maddr = HFI_MMAP_ERRCHECK(fd, binfo, rcvhdr_bufbase, sz, PROT_READ); - psm3_gen1_touch_mmap(maddr, sz); - psm3_gen1_arrsz[RCVHDR_BUFBASE] = sz; - - /* 5. Map the receive eager buffer - * (u16 * u32. Assuming size_t's precision is 64 bits - no overflow) - */ - sz = (size_t)cinfo->egrtids * cinfo->rcvegr_size; - maddr = HFI_MMAP_ERRCHECK(fd, binfo, rcvegr_bufbase, sz, PROT_READ); - psm3_gen1_touch_mmap(maddr, sz); - psm3_gen1_arrsz[RCVEGR_BUFBASE] = sz; - - /* 6. 
Map the sdma completion queue */ - if (cinfo->runtime_flags & HFI1_CAP_SDMA) { - sz = cinfo->sdma_ring_size * sizeof(struct hfi1_sdma_comp_entry); - HFI_MMAP_ERRCHECK(fd, binfo, sdma_comp_bufbase, sz, PROT_READ); - } else { - sz = 0; - binfo->sdma_comp_bufbase = (__u64)0; - } - psm3_gen1_arrsz[SDMA_COMP_BUFBASE] = sz; - - /* 7. Map RXE per-context CSRs */ - sz = HFI_MMAP_PGSIZE; - HFI_MMAP_ERRCHECK(fd, binfo, user_regbase, sz, PROT_WRITE|PROT_READ); - psm3_gen1_arrsz[USER_REGBASE] = sz; - /* Set up addresses for optimized register writeback routines. - * This is for the real onchip registers, shared context or not - */ - uint64_t *regbasep = (uint64_t *)binfo->user_regbase; - ctrl->__hfi_rcvhdrtail = (volatile __le64 *)(regbasep + ur_rcvhdrtail); - ctrl->__hfi_rcvhdrhead = (volatile __le64 *)(regbasep + ur_rcvhdrhead); - ctrl->__hfi_rcvegrtail = (volatile __le64 *)(regbasep + ur_rcvegrindextail); - ctrl->__hfi_rcvegrhead = (volatile __le64 *)(regbasep + ur_rcvegrindexhead); - ctrl->__hfi_rcvofftail = (volatile __le64 *)(regbasep + ur_rcvegroffsettail); - - // mimic OPA code which never actually set HDRSUPP_ENABLED and never - // tested it here, so the PSM3_HDRSUPP env variable was never fetched - // and user could not control HDRSUPP_ENABLED - if ((cinfo->runtime_flags & HFI1_CAP_HDRSUPP) - /*&& psmi_hal_has_sw_status(PSM_HAL_HDRSUPP_ENABLED)*/) { - ctrl->__hfi_rcvtidflow = (volatile __le64 *)(regbasep + ur_rcvtidflowtable); - ctrl->__hfi_tfvalid = 1; - } else { - /* no hdr_supp hw/driver capability or - * user wants to disable header suppression */ - ctrl->__hfi_rcvtidflow = ctrl->regs; - ctrl->__hfi_tfvalid = 0; - } - - /* 8. Map the rcvhdrq tail register address */ - if (cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { - sz = HFI_MMAP_PGSIZE; - HFI_MMAP_ERRCHECK(fd, binfo, rcvhdrtail_base, sz, PROT_READ); - } else { - /* We don't use receive header queue tail register to detect new packets, - * but here we save the address for false-eager-full recovery - */ - sz = 0; - /* This points inside the previously established mapping (user_rehbase). Don't munmap()! */ - binfo->rcvhdrtail_base = (uint64_t) (uintptr_t) ctrl->__hfi_rcvhdrtail; - } - ctrl->__hfi_rcvtail = (__le64 *)binfo->rcvhdrtail_base; - psm3_gen1_arrsz[RCVHDRTAIL_BASE] = sz; - - /* 9. Map the event page */ - off = binfo->events_bufbase &~ HFI_MMAP_PGMASK; - - sz = HFI_MMAP_PGSIZE; - HFI_MMAP_ERRCHECK(fd, binfo, events_bufbase, sz, PROT_READ); - psm3_gen1_arrsz[EVENTS_BUFBASE] = sz; - /* keep the offset in the address */ - binfo->events_bufbase |= off; - - /* 10. Map the status page */ - sz = HFI_MMAP_PGSIZE; - HFI_MMAP_ERRCHECK(fd, binfo, status_bufbase, sz, PROT_READ); - psm3_gen1_arrsz[STATUS_BUFBASE] = sz; - - if (!subctxt_cnt) - return 0; - - /* 11. If subcontext is used, map the buffers */ - const char *errstr = "Incorrect input values for the subcontext"; - size_t factor; - - /* 11a) subctxt_uregbase */ - sz = HFI_MMAP_PGSIZE; - maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_uregbase, sz, PROT_READ|PROT_WRITE); - psm3_gen1_touch_mmap(maddr, sz); - psm3_gen1_arrsz[SUBCTXT_UREGBASE] = sz; - - /* 11b) subctxt_rcvhdrbuf - * u16 * u16. 
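For reference, a tiny sketch of the offset-preserving pattern used above for sc_credits_addr and events_bufbase: the page containing the register is mapped, and the original intra-page offset is folded back into the mapped address. restore_subpage_offset() is illustrative; the real code expresses this with the HFI_MMAP_PGMASK manipulation around HFI_MMAP_ERRCHECK().

    #include <stdint.h>
    #include <unistd.h>

    static void *restore_subpage_offset(void *page_aligned_mapping, uint64_t raw_addr)
    {
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);

        /* keep only the low bits that locate the register inside its page */
        return (char *)page_aligned_mapping + (raw_addr & (pgsz - 1));
    }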
Prevent promotion to int through an explicit cast to size_t - */ - factor = (size_t)cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize; - factor = ALIGN(factor, HFI_MMAP_PGSIZE); - sz = factor * subctxt_cnt; - maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_rcvhdrbuf, sz, PROT_READ|PROT_WRITE); - psm3_gen1_touch_mmap(maddr, sz); - psm3_gen1_arrsz[SUBCTXT_RCVHDRBUF] = sz; - - /* 11c) subctxt_rcvegrbuf - * u16 * u32. Assuming size_t's precision to be 64 bits (no overflow) - */ - factor = (size_t)cinfo->egrtids * cinfo->rcvegr_size; - factor = ALIGN(factor, HFI_MMAP_PGSIZE); - sz = factor * subctxt_cnt; - if (sz / subctxt_cnt != factor) { - _HFI_INFO("%s (rcvegrbuf)\n", errstr); - goto err_int_overflow_subctxt_rcvegrbuf; - } - maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_rcvegrbuf, sz, PROT_READ|PROT_WRITE); - psm3_gen1_touch_mmap(maddr, sz); - psm3_gen1_arrsz[SUBCTXT_RCVEGRBUF] = sz; - - return 0; - -err_int_overflow_subctxt_rcvegrbuf: -err_mmap_subctxt_rcvegrbuf: - /* if we got here, subctxt_cnt must be != 0 */ - HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvhdrbuf, psm3_gen1_arrsz[SUBCTXT_RCVHDRBUF]); - -err_mmap_subctxt_rcvhdrbuf: - /* if we got it here, subctxt_cnt must be != 0 */ - HFI_MUNMAP_ERRCHECK(binfo, subctxt_uregbase, psm3_gen1_arrsz[SUBCTXT_UREGBASE]); - -err_mmap_subctxt_uregbase: - HFI_MUNMAP_ERRCHECK(binfo, status_bufbase, psm3_gen1_arrsz[STATUS_BUFBASE]); - -err_mmap_status_bufbase: - HFI_MUNMAP_ERRCHECK(binfo, events_bufbase, psm3_gen1_arrsz[EVENTS_BUFBASE]); - -err_mmap_events_bufbase: - if(cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { - HFI_MUNMAP_ERRCHECK(binfo, rcvhdrtail_base, psm3_gen1_arrsz[RCVHDRTAIL_BASE]); - } - -err_mmap_rcvhdrtail_base: - HFI_MUNMAP_ERRCHECK(binfo, user_regbase, psm3_gen1_arrsz[USER_REGBASE]); - -err_mmap_user_regbase: - /* the condition could be: if(cinfo->runtime_flags & HFI1_CAP_SDMA) too */ - if(binfo->sdma_comp_bufbase != 0) { - HFI_MUNMAP_ERRCHECK(binfo, sdma_comp_bufbase, psm3_gen1_arrsz[SDMA_COMP_BUFBASE]); - } - -err_mmap_sdma_comp_bufbase: - HFI_MUNMAP_ERRCHECK(binfo, rcvegr_bufbase, psm3_gen1_arrsz[RCVEGR_BUFBASE]); - -err_mmap_rcvegr_bufbase: - HFI_MUNMAP_ERRCHECK(binfo, rcvhdr_bufbase, psm3_gen1_arrsz[RCVHDR_BUFBASE]); - -err_mmap_rcvhdr_bufbase: - HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase, psm3_gen1_arrsz[PIO_BUFBASE]); - -err_mmap_pio_bufbase: - HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase_sop, psm3_gen1_arrsz[PIO_BUFBASE_SOP]); - -err_mmap_pio_bufbase_sop: - HFI_MUNMAP_ERRCHECK(binfo, sc_credits_addr, psm3_gen1_arrsz[SC_CREDITS]); - -err_mmap_sc_credits_addr: - return -1; -} - -/* It is allowed to have multiple devices (and of different types) - simultaneously opened and initialized, although this (still! Oct 07) - implemented. This routine is used by the low level hfi protocol code (and - any other code that has similar low level functionality). - This is the only routine that takes a file descriptor, rather than an - struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything - else is returned as part of hfi1_base_info. -*/ -struct _hfi_ctrl *psm3_gen1_userinit_internal(int fd, bool skip_affinity, - struct hfi1_user_info_dep *uinfo) -{ - struct _hfi_ctrl *spctrl = NULL; - struct hfi1_ctxt_info *cinfo; - struct hfi1_base_info *binfo; - struct hfi1_cmd c; - int __hfi_pg_sz; -#ifdef PSM2_SUPPORT_IW_CMD_API - /* for major version 6 of driver, we will use uinfo_new. See below for details. 
*/ - struct hfi1_user_info uinfo_new = {0}; -#endif - - /* First get the page size */ - __hfi_pg_sz = sysconf(_SC_PAGESIZE); - - if (!(spctrl = calloc(1, sizeof(struct _hfi_ctrl)))) { - _HFI_INFO("can't allocate memory for hfi_ctrl: %s\n", - strerror(errno)); - goto err_calloc_hfi_ctrl; - } - cinfo = &spctrl->ctxt_info; - binfo = &spctrl->base_info; - - _HFI_VDBG("uinfo: ver %x, alg %d, subc_cnt %d, subc_id %d\n", - uinfo->userversion, uinfo->hfi1_alg, - uinfo->subctxt_cnt, uinfo->subctxt_id); - - /* 1. ask driver to assign context to current process */ - memset(&c, 0, sizeof(struct hfi1_cmd)); - c.type = PSMI_HFI_CMD_ASSIGN_CTXT; - -#ifdef PSM2_SUPPORT_IW_CMD_API - /* If psm is communicating with a MAJOR version 6 driver, we need - to pass in an actual struct hfi1_user_info not a hfi1_user_info_dep. - Else if psm is communicating with a MAJOR version 5 driver, we can - just continue to pass a hfi1_user_info_dep as struct hfi1_user_info_dep - is identical to the MAJOR version 5 struct hfi1_user_info. */ - if (psm3_gen1_get_user_major_version() == IOCTL_CMD_API_MODULE_MAJOR) - { - /* If psm is communicating with a MAJOR version 6 driver, - we copy uinfo into uinfo_new and pass uinfo_new to the driver. */ - c.len = sizeof(uinfo_new); - c.addr = (__u64) (&uinfo_new); - - uinfo_new.userversion = uinfo->userversion; - uinfo_new.pad = uinfo->pad; - uinfo_new.subctxt_cnt = uinfo->subctxt_cnt; - uinfo_new.subctxt_id = uinfo->subctxt_id; - memcpy(uinfo_new.uuid,uinfo->uuid,sizeof(uinfo_new.uuid)); - } - else - { - /* If psm is working with an old driver, we continue to use - the struct hfi1_user_info_dep version of the struct: */ - c.len = sizeof(*uinfo); - c.addr = (__u64) uinfo; - } -#else - c.len = sizeof(*uinfo); - c.addr = (__u64) uinfo; -#endif - if (psm3_gen1_nic_cmd_write(fd, &c, sizeof(c)) == -1) { - if (errno == ENODEV) { - _HFI_INFO("PSM3 and driver version mismatch\n"); - /* Overwrite errno. One would wish that the driver - * didn't return ENODEV for a version mismatch */ - errno = EPROTONOSUPPORT; - } else { - _HFI_INFO("assign_context command failed: %s\n", - strerror(errno)); - } - goto err_hfi_cmd_assign_ctxt; - } - -#ifdef PSM2_SUPPORT_IW_CMD_API - if (psm3_gen1_get_user_major_version() == IOCTL_CMD_API_MODULE_MAJOR) - { - /* for the new driver, we copy the results of the call back to uinfo from - uinfo_new. */ - uinfo->userversion = uinfo_new.userversion; - uinfo->pad = uinfo_new.pad; - uinfo->subctxt_cnt = uinfo_new.subctxt_cnt; - uinfo->subctxt_id = uinfo_new.subctxt_id; - memcpy(uinfo->uuid,uinfo_new.uuid,sizeof(uinfo_new.uuid)); - } -#endif - - /* 2. get context info from driver */ - c.type = PSMI_HFI_CMD_CTXT_INFO; - c.len = sizeof(*cinfo); - c.addr = (__u64) cinfo; - - if (psm3_gen1_nic_cmd_write(fd, &c, sizeof(c)) == -1) { - _HFI_INFO("CTXT_INFO command failed: %s\n", strerror(errno)); - goto err_hfi_cmd_ctxt_info; - } - - /* sanity checking... 
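For reference, the alignment rules that the sanity checks just below enforce, collected into one illustrative helper; the real code logs each violation separately and bails out.

    #include <stdint.h>

    /* Returns nonzero when the context geometry reported by the driver is usable. */
    static int ctxt_info_geometry_ok(uint32_t rcvtids, uint32_t egrtids,
                                     uint32_t rcvhdrq_cnt, uint32_t rcvhdrq_entsize,
                                     uint32_t rcvegr_size, uint32_t pagesize)
    {
        return (rcvtids % 8 == 0) &&
               (egrtids % 8 == 0) &&
               (rcvtids >= egrtids) &&
               (rcvhdrq_cnt % 32 == 0) &&
               (rcvhdrq_entsize % 64 == 0) &&
               (rcvegr_size % pagesize == 0);
    }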
*/ - if (cinfo->rcvtids%8) { - _HFI_INFO("rcvtids not 8 multiple: %d\n", cinfo->rcvtids); - goto err_sanity_check; - } - if (cinfo->egrtids%8) { - _HFI_INFO("egrtids not 8 multiple: %d\n", cinfo->egrtids); - goto err_sanity_check; - } - if (cinfo->rcvtids < cinfo->egrtids) { - _HFI_INFO("rcvtids(%d) < egrtids(%d)\n", - cinfo->rcvtids, cinfo->egrtids); - goto err_sanity_check; - } - if (cinfo->rcvhdrq_cnt%32) { - _HFI_INFO("rcvhdrq_cnt not 32 multiple: %d\n", - cinfo->rcvhdrq_cnt); - goto err_sanity_check; - } - if (cinfo->rcvhdrq_entsize%64) { - _HFI_INFO("rcvhdrq_entsize not 64 multiple: %d\n", - cinfo->rcvhdrq_entsize); - goto err_sanity_check; - } - if (cinfo->rcvegr_size%__hfi_pg_sz) { - _HFI_INFO("rcvegr_size not page multiple: %d\n", - cinfo->rcvegr_size); - goto err_sanity_check; - } - - _HFI_VDBG("ctxtinfo: runtime_flags %llx, rcvegr_size %d\n", - cinfo->runtime_flags, cinfo->rcvegr_size); - _HFI_VDBG("ctxtinfo: active %d, unit %d, ctxt %d, subctxt %d\n", - cinfo->num_active, cinfo->unit, cinfo->ctxt, cinfo->subctxt); - _HFI_VDBG("ctxtinfo: rcvtids %d, credits %d\n", - cinfo->rcvtids, cinfo->credits); - _HFI_VDBG("ctxtinfo: numa %d, cpu %x, send_ctxt %d\n", - cinfo->numa_node, cinfo->rec_cpu, cinfo->send_ctxt); - _HFI_VDBG("ctxtinfo: rcvhdrq_cnt %d, rcvhdrq_entsize %d\n", - cinfo->rcvhdrq_cnt, cinfo->rcvhdrq_entsize); - _HFI_VDBG("ctxtinfo: egrtids %d, sdma_ring_size %d\n", - cinfo->egrtids, cinfo->sdma_ring_size); - - // On OPA by default this was a noop since driver - // returned -1 for cinfo->rec_cpu - /* if affinity has not been setup, set it */ - if (getenv("PSM3_FORCE_CPUAFFINITY") || - (cinfo->rec_cpu != (__u16) -1 && - !(getenv("PSM3_NO_CPUAFFINITY") || skip_affinity))) - { - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(cinfo->rec_cpu, &cpuset); - if (sched_setaffinity(0, sizeof(cpuset), &cpuset)) { - _HFI_INFO("Couldn't set runon processor %u " - "(unit:context %u:%u) (%u active chips): %s\n", - cinfo->rec_cpu, cinfo->unit, cinfo->ctxt, - cinfo->num_active, strerror(errno)); - } - } - - /* 4. Get user base info from driver */ - c.type = PSMI_HFI_CMD_USER_INFO; - c.len = sizeof(*binfo); - c.addr = (__u64) binfo; - - if (psm3_gen1_nic_cmd_write(fd, &c, sizeof(c)) == -1) { - _HFI_INFO("BASE_INFO command failed: %s\n", strerror(errno)); - goto err_hfi_cmd_user_info; - } - - psm3_gen1_set_user_version(binfo->sw_version); - - _HFI_VDBG("baseinfo: hwver %x, swver %x, jkey %d, qp %d\n", - binfo->hw_version, binfo->sw_version, - binfo->jkey, binfo->bthqp); - _HFI_VDBG("baseinfo: credit_addr %llx, sop %llx, pio %llx\n", - binfo->sc_credits_addr, binfo->pio_bufbase_sop, - binfo->pio_bufbase); - _HFI_VDBG("baseinfo: hdrbase %llx, egrbase %llx, sdmabase %llx\n", - binfo->rcvhdr_bufbase, binfo->rcvegr_bufbase, - binfo->sdma_comp_bufbase); - _HFI_VDBG("baseinfo: ureg %llx, eventbase %llx, " - "statusbase %llx, tailaddr %llx\n", binfo->user_regbase, - binfo->events_bufbase, binfo->status_bufbase, - binfo->rcvhdrtail_base); - - /* - * Check if driver version matches PSM version, - * this is different from PSM API version. 
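For reference, the version split used in the check that follows: the driver packs its major version in the upper 16 bits of sw_version (HFI1_SWMAJOR_SHIFT is 16, per the deprecated header above) and the minor in the lower 16; a driver whose major is older than PSM's is rejected, while a minor mismatch is only logged.

    #include <stdint.h>

    #define SWMAJOR_SHIFT 16    /* mirrors HFI1_SWMAJOR_SHIFT */

    static inline uint16_t sw_major(uint32_t sw_version)
    {
        return (uint16_t)(sw_version >> SWMAJOR_SHIFT);
    }

    static inline uint16_t sw_minor(uint32_t sw_version)
    {
        return (uint16_t)(sw_version & 0xffff);
    }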
- */ - if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) != psm3_gen1_get_user_major_version()) { - _HFI_INFO - ("User major version 0x%x not same as driver major 0x%x\n", - psm3_gen1_get_user_major_version(), binfo->sw_version >> HFI1_SWMAJOR_SHIFT); - if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) < psm3_gen1_get_user_major_version()) - goto err_version_mismatch; /* else assume driver knows how to be compatible */ - } else if ((binfo->sw_version & 0xffff) != HFI1_USER_SWMINOR) { - _HFI_PRDBG - ("User minor version 0x%x not same as driver minor 0x%x\n", - HFI1_USER_SWMINOR, binfo->sw_version & 0xffff); - } - - if (psm3_gen1_map_hfi_mem(fd, spctrl, uinfo->subctxt_cnt) == -1) - goto err_map_hfi_mem; - - /* Save some info. */ - spctrl->fd = fd; - spctrl->__hfi_unit = cinfo->unit; - /* - * driver should provide the port where the context is opened for, But - * OPA driver does not have port interface to psm because there is only - * one port. So we hardcode the port to 1 here. When we work on the - * version of PSM for the successor to OPA, we should have port returned - * from driver and will be set accordingly. - */ - /* spctrl->__hfi_port = cinfo->port; */ - spctrl->__hfi_port = 1; - spctrl->__hfi_tidegrcnt = cinfo->egrtids; - spctrl->__hfi_tidexpcnt = cinfo->rcvtids - cinfo->egrtids; - - return spctrl; - -err_map_hfi_mem: -err_version_mismatch: -err_hfi_cmd_user_info: - /* TODO: restore the original CPU affinity? */ - -err_sanity_check: -err_hfi_cmd_ctxt_info: - /* TODO: ioctl de-assign context here? */ - // without de-assigning the context, all subsequent psm3_gen1_userinit_internal() - // calls are going to fail - _HFI_ERROR("An unrecoverable error occurred while communicating with the driver\n"); - abort(); /* TODO: or do we want to include psm_user.h to use psm3_handle_error()? */ - // no recovery here - - /* if we failed to allocate memory or to assign the context, we might still recover from this. - * Returning NULL will cause the function to be reinvoked n times. Do we really want this - * behavior? - */ -err_hfi_cmd_assign_ctxt: - free(spctrl); - -err_calloc_hfi_ctrl: - return NULL; -} - -struct _hfi_ctrl *psm3_gen1_userinit(int fd, struct hfi1_user_info_dep *uinfo) -{ - return psm3_gen1_userinit_internal(fd, false, uinfo); -} -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_ptl_ips.c b/psm3/hal_gen1/gen1_ptl_ips.c deleted file mode 100644 index f6db26d..0000000 --- a/psm3/hal_gen1/gen1_ptl_ips.c +++ /dev/null @@ -1,1634 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2021 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2021 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2021 Intel Corporation. All rights reserved. */ - -/* This file implements the HAL specific code for PSM PTL for ips */ -#include "psm_user.h" -#include "psm2_hal.h" -#include "ptl_ips.h" -#include "psm_mq_internal.h" -#include "gen1_hal.h" -#include "gen1_spio.c" // TBD make this a normal .c file, just needed spio_init - -/* - * Sample implementation of shared contexts context. - * - * In shared mode, the hardware queue is serviced by more than one process. - * Each process also mirrors the hardware queue in software (represented by an - * ips_recvhdrq). For packets we service in the hardware queue that are not - * destined for us, we write them in other processes's receive queues - * (represented by an gen1_ips_writehdrq). - * - */ -struct gen1_ptl_shared { - ptl_t *ptl; /* backptr to main ptl */ - uint32_t context; - uint32_t subcontext; - uint32_t subcontext_cnt; - - pthread_spinlock_t *context_lock; - struct gen1_ips_subcontext_ureg *subcontext_ureg[PSM_HAL_MAX_SHARED_CTXTS]; - struct gen1_ips_hwcontext_ctrl *hwcontext_ctrl; - struct ips_recvhdrq recvq; /* subcontext receive queue */ - struct ips_recvhdrq_state recvq_state; /* subcontext receive queue state */ - struct gen1_ips_writehdrq writeq[PSM_HAL_MAX_SHARED_CTXTS]; /* peer subcontexts */ -}; - -psm2_error_t psm3_gen1_ips_ptl_poll(ptl_t *ptl_gen, int _ignored); -int psm3_gen1_ips_ptl_recvq_isempty(const struct ptl *ptl); -psm2_error_t psm3_gen1_ips_ptl_shared_poll(ptl_t *ptl, int _ignored); - -static inline int psm3_gen1_get_sc2vl_map(struct ips_proto *proto) -{ - hfp_gen1_pc_private *psm_hw_ctxt = proto->ep->context.psm_hw_ctxt; - uint8_t i; - - /* Get SC2VL table for unit, port */ - for (i = 0; i < PSMI_N_SCS; i++) { - int ret = psm3_gen1_get_port_sc2vl(proto->ep->unit_id, - proto->ep->portnum, i); - if (ret < 0) - /* Unable to get SC2VL. Set it to default */ - ret = PSMI_VL_DEFAULT; - - psm_hw_ctxt->sc2vl[i] = (uint16_t) ret; - } - return PSM_HAL_ERROR_OK; -} - -/* (Re)load the SL2SC table */ -void psm3_gen1_ips_ptl_init_sl2sc_table(struct ips_proto *proto) -{ - int ret, i; - - /* Get SL2SC table for unit, port */ - for (i = 0; i < PSMI_N_SCS; i++) { - if ((ret = - psm3_gen1_get_port_sl2sc(proto->ep->unit_id, - proto->ep->portnum, (uint8_t) i)) < 0) { - /* Unable to get SL2SC. 
Set it to default */ - ret = PSMI_SC_DEFAULT; - } - - proto->sl2sc[i] = (uint16_t) ret; - } - psm3_gen1_get_sc2vl_map(proto); -} - -static inline int psm3_hfp_gen1_write_header_to_subcontext(struct ips_message_header *pimh, - psm3_gen1_cl_idx idx, - psm3_gen1_raw_rhf_t rhf, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - psm3_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; - uint32_t *pu32 = pcl_q->hdr_qe.hdrq_base_addr + (idx + psm3_gen1_hdrget_hdrq_offset((uint32_t *)&rhf)); - struct ips_message_header *piph_dest = (struct ips_message_header *)pu32; - - *piph_dest = *pimh; - return PSM_HAL_ERROR_OK; -} - -static inline -int -psm3_gen1_write_eager_packet(struct gen1_ips_writehdrq *writeq, - struct ips_recvhdrq_event *rcv_ev, - psm3_gen1_cl_idx write_hdr_tail, - uint32_t subcontext, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - psm3_gen1_cl_idx write_egr_tail; - write_egr_tail = psm3_gen1_get_cl_q_tail_index( - PSM3_GEN1_GET_SC_CL_Q_RX_EGR_Q(subcontext), - ctxt); - uint32_t next_write_egr_tail = write_egr_tail; - /* checksum is trimmed from paylen, we need to add back */ - uint32_t rcv_paylen = ips_recvhdrq_event_paylen(rcv_ev) + - (rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0); - psmi_assert(rcv_paylen > 0); - uint32_t egr_elemcnt = ctrl->ctxt_info.egrtids; - uint32_t egr_elemsz = ctrl->ctxt_info.rcvegr_size; - - /* Loop as long as the write eager queue is NOT full */ - while (1) { - next_write_egr_tail++; - if (next_write_egr_tail >= egr_elemcnt) - next_write_egr_tail = 0; - psm3_gen1_cl_idx egr_head; - egr_head = psm3_gen1_get_cl_q_head_index( - PSM3_GEN1_GET_SC_CL_Q_RX_EGR_Q(subcontext), - ctxt); - if (next_write_egr_tail == egr_head) { - break; - } - - /* Move to next eager entry if leftover is not enough */ - if ((writeq->state->egrq_offset + rcv_paylen) > - egr_elemsz) { - writeq->state->egrq_offset = 0; - write_egr_tail = next_write_egr_tail; - - /* Update the eager buffer tail pointer */ - psm3_gen1_set_cl_q_tail_index(write_egr_tail, - PSM3_GEN1_GET_SC_CL_Q_RX_EGR_Q(subcontext), - ctxt); - } else { - /* There is enough space in this entry! */ - /* Use pre-calculated address from look-up table */ - char *write_payload = - psm_hw_ctxt->cl_qs[PSM3_GEN1_GET_SC_CL_Q_RX_EGR_Q(subcontext)].egr_buffs[write_egr_tail] - + writeq->state->egrq_offset; - const char *rcv_payload = - ips_recvhdrq_event_payload(rcv_ev); - - psmi_assert(write_payload != NULL); - psmi_assert(rcv_payload != NULL); - psm3_mq_mtucpy(write_payload, rcv_payload, rcv_paylen); - - /* Fix up the rhf with the subcontext's eager index/offset */ - psm3_gen1_hdrset_egrbfr_index((uint32_t*)(&rcv_ev->gen1_rhf.raw_rhf),write_egr_tail); - psm3_gen1_hdrset_egrbfr_offset((uint32_t *)(&rcv_ev->gen1_rhf.raw_rhf), (writeq->state-> - egrq_offset >> 6)); - /* Copy the header to the subcontext's header queue */ - psm3_hfp_gen1_write_header_to_subcontext(rcv_ev->p_hdr, - write_hdr_tail, - rcv_ev->gen1_rhf.raw_rhf, - PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(subcontext), - ctxt); - - /* Update offset to next 64B boundary */ - writeq->state->egrq_offset = - (writeq->state->egrq_offset + rcv_paylen + - 63) & (~63); - return IPS_RECVHDRQ_CONTINUE; - } - } - - /* At this point, the eager queue is full -- drop the packet. 
*/ - /* Copy the header to the subcontext's header queue */ - - /* Mark header with ETIDERR (eager overflow) */ - psm3_gen1_hdrset_err_flags((uint32_t*) (&rcv_ev->gen1_rhf.raw_rhf), HFI_RHF_TIDERR); - - /* Clear UseEgrBfr bit because payload is dropped */ - psm3_gen1_hdrset_use_egrbfr((uint32_t *)(&rcv_ev->gen1_rhf.raw_rhf), 0); - psm3_hfp_gen1_write_header_to_subcontext(rcv_ev->p_hdr, - write_hdr_tail, - rcv_ev->gen1_rhf.raw_rhf, - PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(subcontext), - ctxt); - return IPS_RECVHDRQ_BREAK; -} - -static inline -void -psm3_gen1_writehdrq_write_rhf_atomic(uint64_t *rhf_dest, uint64_t rhf_src) -{ - /* - * In 64-bit mode, we check in init that the rhf will always be 8-byte - * aligned - */ - *rhf_dest = rhf_src; - return; -} - -static inline int psm3_hfp_gen1_write_rhf_to_subcontext(psm3_gen1_raw_rhf_t rhf, - psm3_gen1_cl_idx idx, - uint32_t *phdrq_rhf_seq, - psm3_gen1_cl_q cl_q, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - psm3_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; - - if (!get_psm_gen1_hi()->hfp_private.dma_rtail) - { - uint32_t rhf_seq = *phdrq_rhf_seq; - psm3_gen1_hdrset_seq((uint32_t *) &rhf, rhf_seq); - rhf_seq++; - if (rhf_seq > LAST_RHF_SEQNO) - rhf_seq = 1; - - *phdrq_rhf_seq = rhf_seq; - } - - /* Now write the new rhf */ - psm3_gen1_writehdrq_write_rhf_atomic((uint64_t*)(pcl_q->hdr_qe.hdrq_base_addr + - (idx + get_psm_gen1_hi()->hfp_private.hdrq_rhf_off)), - rhf); - return PSM_HAL_ERROR_OK; -} - -static -int -psm3_gen1_ips_subcontext_ignore(struct ips_recvhdrq_event *rcv_ev, - uint32_t subcontext) -{ - return IPS_RECVHDRQ_CONTINUE; -} - -static inline -int -psm3_gen1_forward_packet_to_subcontext(struct gen1_ips_writehdrq *writeq, - struct ips_recvhdrq_event *rcv_ev, - uint32_t subcontext, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - psm3_gen1_cl_idx write_hdr_head; - psm3_gen1_cl_idx write_hdr_tail; - uint32_t hdrq_elemsz = ctrl->ctxt_info.rcvhdrq_entsize >> BYTE2DWORD_SHIFT; - psm3_gen1_cl_idx next_write_hdr_tail; - int result = IPS_RECVHDRQ_CONTINUE; - - /* Drop packet if write header queue is disabled */ - if_pf (!writeq->state->enabled) { - return IPS_RECVHDRQ_BREAK; - } - - write_hdr_head = psm3_gen1_get_cl_q_head_index( - PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(subcontext), - ctxt); - write_hdr_tail = psm3_gen1_get_cl_q_tail_index( - PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(subcontext), - ctxt); - /* Drop packet if write header queue is full */ - next_write_hdr_tail = write_hdr_tail + hdrq_elemsz; - if (next_write_hdr_tail > writeq->hdrq_elemlast) { - next_write_hdr_tail = 0; - } - if (next_write_hdr_tail == write_hdr_head) { - return IPS_RECVHDRQ_BREAK; - } - // could test rcv_ev->payload instead of use_egr_buff - if (psm3_gen1_rhf_get_use_egr_buff(rcv_ev->gen1_rhf)) - { - result = psm3_gen1_write_eager_packet(writeq, rcv_ev, - write_hdr_tail, - subcontext, - ctxt); - } else { - /* Copy the header to the subcontext's header queue */ - psm3_hfp_gen1_write_header_to_subcontext(rcv_ev->p_hdr, - write_hdr_tail, - rcv_ev->gen1_rhf.raw_rhf, - PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(subcontext), - ctxt); - } - - /* Ensure previous writes are visible before writing rhf seq or tail */ - ips_wmb(); - - /* The following func call may modify the hdrq_rhf_seq */ - psm3_hfp_gen1_write_rhf_to_subcontext(rcv_ev->gen1_rhf.raw_rhf, write_hdr_tail, - &writeq->state->hdrq_rhf_seq, - PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(subcontext), - ctxt); - /* The tail must be updated regardless of 
PSM_HAL_CAP_DMA_RTAIL - * since this tail is also used to keep track of where - * to write to next. For subcontexts there is - * no separate shadow copy of the tail. */ - psm3_gen1_set_cl_q_tail_index(next_write_hdr_tail, - PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(subcontext), - ctxt); - - return result; -} - -static -int -psm3_gen1_ips_subcontext_process(struct ips_recvhdrq_event *rcv_ev, - uint32_t subcontext) -{ - struct gen1_ptl_shared *recvshc = ((struct ptl_ips *)(rcv_ev->proto->ptl))->recvshc; - if_pt(subcontext != recvshc->subcontext && - subcontext < recvshc->subcontext_cnt) { - return psm3_gen1_forward_packet_to_subcontext(&recvshc->writeq[subcontext], - rcv_ev, subcontext, - rcv_ev->recvq->context->psm_hw_ctxt); - } - else { - _HFI_VDBG - ("Drop pkt for subcontext %d out of %d (I am %d) : errors 0x%x\n", - (int)subcontext, (int)recvshc->subcontext_cnt, - (int)recvshc->subcontext, psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf)); - return IPS_RECVHDRQ_BREAK; - } -} - -static psm2_error_t psm3_gen1_shrecvq_init(ptl_t *ptl, const psmi_context_t *context); -static psm2_error_t psm3_gen1_shrecvq_fini(ptl_t *ptl); - -static inline int psm3_gen1_subcontext_ureg_get(ptl_t *ptl_gen, - struct gen1_ips_subcontext_ureg **uregp, - psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - int i; - struct ptl_ips *ptl = (struct ptl_ips *) ptl_gen; - - ptl->recvshc->hwcontext_ctrl = psm_hw_ctxt->hwcontext_ctrl; - for (i=0;i < psm_hw_ctxt->user_info.subctxt_cnt; i++) - uregp[i] = psm_hw_ctxt->subcontext_ureg[i]; - return PSM_HAL_ERROR_OK; -} - -// initialize HAL specific parts of ptl_ips -// This is called after most of the generic aspects have been initialized -// so we can use ptl->ep, ptl->ctl, etc as needed -// However it is called prior to ips_proto_init. ips_proto_init requires some -// ips_ptl items such as ptl->spioc -psm2_error_t psm3_gen1_ips_ptl_init_pre_proto_init(struct ptl_ips *ptl) -{ - psm2_error_t err = PSM2_OK; - const psmi_context_t *context = &ptl->ep->context; - const int enable_shcontexts = (psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt) > 0); - ptl->ctl->ep_poll = enable_shcontexts ? psm3_gen1_ips_ptl_shared_poll : psm3_gen1_ips_ptl_poll; - /* - * Context sharing, setup subcontext ureg page. - */ - if (enable_shcontexts) { - struct gen1_ptl_shared *recvshc; - - recvshc = (struct gen1_ptl_shared *) - psmi_calloc(ptl->ep, UNDEFINED, 1, sizeof(struct gen1_ptl_shared)); - if (recvshc == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - - ptl->recvshc = recvshc; - recvshc->ptl = (ptl_t *)ptl; - - /* Initialize recvshc fields */ - recvshc->context = psm3_gen1_get_context(context->psm_hw_ctxt); - recvshc->subcontext = psmi_hal_get_subctxt(context->psm_hw_ctxt); - recvshc->subcontext_cnt = psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt); - psmi_assert_always(recvshc->subcontext_cnt <= - PSM_HAL_MAX_SHARED_CTXTS); - psmi_assert_always(recvshc->subcontext < - recvshc->subcontext_cnt); - - /* - * Using ep->context to avoid const modifier since this function - * will modify the content in ep->context. - */ - if ((err = psm3_gen1_subcontext_ureg_get((ptl_t *)ptl, - recvshc->subcontext_ureg, context->psm_hw_ctxt))) - goto fail; - - /* Note that the GEN1 HAL instance initializes struct gen1_ips_subcontext_ureg - during context open. 
*/ - - recvshc->context_lock = &recvshc->hwcontext_ctrl->context_lock; - if (recvshc->subcontext == 0) { - if (pthread_spin_init(recvshc->context_lock, - PTHREAD_PROCESS_SHARED) != 0) { - err = - psm3_handle_error(ptl->ep, - PSM2_EP_DEVICE_FAILURE, - "Couldn't initialize process-shared spin lock"); - goto fail; - } - } - } - /* - * Hardware send pio used by eager and control messages. - */ - if ((err = psm3_gen1_spio_init(context, (ptl_t *)ptl, &ptl->spioc))) - goto fail; -fail: - return err; -} - -// initialize HAL specific parts of ptl_ips -// This is called after after ips_proto_init and after most of the generic -// aspects of ips_ptl have been initialized -// so we can use ptl->ep and ptl->proto as needed -psm2_error_t psm3_gen1_ips_ptl_init_post_proto_init(struct ptl_ips *ptl) -{ - psm2_error_t err = PSM2_OK; - const psmi_context_t *context = &ptl->ep->context; - const int enable_shcontexts = (psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt) > 0); - /* - * Hardware receive hdr/egr queue, services incoming packets and issues - * callbacks for protocol handling in proto_recv. It uses the epstate - * interface to determine if a packet is known or unknown. - */ - if (!enable_shcontexts) { - struct ips_recvhdrq_callbacks recvq_callbacks; - recvq_callbacks.callback_packet_unknown = - psm3_gen1_ips_ptl_process_unknown; - recvq_callbacks.callback_subcontext = psm3_gen1_ips_subcontext_ignore; - recvq_callbacks.callback_error = psm3_gen1_ips_ptl_process_packet_error; - if ((err = - psm3_gen1_recvhdrq_init(context, &ptl->epstate, &ptl->proto, - &recvq_callbacks, - 0, - &ptl->recvq - ,&ptl->recvq_state, - PSM3_GEN1_CL_Q_RX_HDR_Q))) - goto fail; - } - /* - * Software receive hdr/egr queue, used in shared contexts. - */ - else if ((err = psm3_gen1_shrecvq_init((ptl_t*)ptl, context))) - goto fail; -fail: - return err; -} - -// finalize HAL specific parts of ptl_ips -// This is called before the generic aspects have been finalized -// but after ips_proto has been finalized -// so we can use ptl->ep as needed -psm2_error_t psm3_gen1_ips_ptl_fini(struct ptl_ips *ptl) -{ - psm2_error_t err = PSM2_OK; - const int enable_shcontexts = (psmi_hal_get_subctxt_cnt(ptl->ep->context.psm_hw_ctxt) > 0); - if ((err = psm3_gen1_spio_fini(&ptl->spioc, ptl->ep->context.psm_hw_ctxt))) - goto fail; - if (enable_shcontexts && (err = psm3_gen1_shrecvq_fini((ptl_t*)ptl))) - goto fail; -fail: - return err; -} - -psm2_error_t psm3_gen1_ips_ptl_poll(ptl_t *ptl_gen, int _ignored) -{ - struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; - const uint64_t current_count = get_cycles(); - const int do_lock = PSMI_LOCK_DISABLED && - psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); - psm2_error_t err = PSM2_OK_NO_PROGRESS; - psm2_error_t err2; - - if (!psm3_gen1_recvhdrq_isempty(&ptl->recvq)) { - if (do_lock && !ips_recvhdrq_trylock(&ptl->recvq)) - return err; - if (ptl->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { - psm3_gen1_recvhdrq_scan_cca(&ptl->recvq); - } - err = psm3_gen1_recvhdrq_progress(&ptl->recvq); - if (do_lock) - ips_recvhdrq_unlock(&ptl->recvq); - if_pf(err > PSM2_OK_NO_PROGRESS) - return err; - err2 = - psmi_timer_process_if_expired(&(ptl->timerq), - current_count); - if (err2 != PSM2_OK_NO_PROGRESS) - return err2; - else - return err; - } - - /* - * Process timer expirations after servicing receive queues (some packets - * may have been acked, some requests-to-send may have been queued). 
- * - * It's safe to look at the timer without holding the lock because it's not - * incorrect to be wrong some of the time. - */ - if (psmi_timer_is_expired(&(ptl->timerq), current_count)) { - if (do_lock) - ips_recvhdrq_lock(&ptl->recvq); - err = psm3_timer_process_expired(&(ptl->timerq), current_count); - if (do_lock) - ips_recvhdrq_unlock(&ptl->recvq); - } - - return err; -} - -PSMI_INLINE(int psm3_gen1_ips_try_lock_shared_context(struct gen1_ptl_shared *recvshc)) -{ - return pthread_spin_trylock(recvshc->context_lock); -} -/* Unused -PSMI_INLINE(void psm3_gen1_ips_lock_shared_context(struct gen1_ptl_shared *recvshc)) -{ - pthread_spin_lock(recvshc->context_lock); -} -*/ -PSMI_INLINE(void psm3_gen1_ips_unlock_shared_context(struct gen1_ptl_shared *recvshc)) -{ - pthread_spin_unlock(recvshc->context_lock); -} - -psm2_error_t psm3_gen1_ips_ptl_shared_poll(ptl_t *ptl_gen, int _ignored) -{ - struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; - const uint64_t current_count = get_cycles(); - psm2_error_t err = PSM2_OK_NO_PROGRESS; - psm2_error_t err2; - struct gen1_ptl_shared *recvshc = ptl->recvshc; - psmi_assert(recvshc != NULL); - - /* The following header queue checks are speculative (but safe) - * until this process has acquired the lock. The idea is to - * minimize lock contention due to processes spinning on the - * shared context. */ - if (psm3_gen1_recvhdrq_isempty(&recvshc->recvq)) { - if (!psm3_gen1_recvhdrq_isempty(&ptl->recvq) && - psm3_gen1_ips_try_lock_shared_context(recvshc) == 0) { - /* check that subcontext is empty while under lock to avoid - * re-ordering of incoming packets (since packets from - * hardware context will be processed immediately). */ - if_pt(psm3_gen1_recvhdrq_isempty(&recvshc->recvq)) { - if (ptl->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { - psm3_gen1_recvhdrq_scan_cca(&ptl->recvq); - } - err = psm3_gen1_recvhdrq_progress(&ptl->recvq); - } - psm3_gen1_ips_unlock_shared_context(recvshc); - } - } - - if_pf(err > PSM2_OK_NO_PROGRESS) - return err; - - if (!psm3_gen1_recvhdrq_isempty(&recvshc->recvq)) { - if (recvshc->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { - psm3_gen1_recvhdrq_scan_cca(&recvshc->recvq); - } - err2 = psm3_gen1_recvhdrq_progress(&recvshc->recvq); - if (err2 != PSM2_OK_NO_PROGRESS) { - err = err2; - } - } - - if_pf(err > PSM2_OK_NO_PROGRESS) - return err; - - /* - * Process timer expirations after servicing receive queues (some packets - * may have been acked, some requests-to-send may have been queued). 
- */ - err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count); - if (err2 != PSM2_OK_NO_PROGRESS) - err = err2; - - return err; -} - -int psm3_gen1_ips_ptl_recvq_isempty(const ptl_t *ptl_gen) -{ - struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; - struct gen1_ptl_shared *recvshc = ptl->recvshc; - - if (recvshc != NULL && !psm3_gen1_recvhdrq_isempty(&recvshc->recvq)) - return 0; - return psm3_gen1_recvhdrq_isempty(&ptl->recvq); -} - -static psm2_error_t -psm3_gen1_ips_ptl_writehdrq_init(const psmi_context_t *context, - struct gen1_ips_writehdrq *writeq, - struct gen1_ips_writehdrq_state *state, - uint32_t subcontext) -{ - uint32_t elemsz = psm3_gen1_get_rx_hdr_q_ent_size(context->psm_hw_ctxt), - elemcnt = psm3_gen1_get_rx_hdr_q_cnt(context->psm_hw_ctxt); - - memset(writeq, 0, sizeof(*writeq)); - writeq->context = context; - writeq->state = state; - writeq->hdrq_elemlast = (elemcnt - 1) * (elemsz >> BYTE2DWORD_SHIFT); - - writeq->state->enabled = 1; - return PSM2_OK; -} - -static psm2_error_t psm3_gen1_shrecvq_init(ptl_t *ptl_gen, const psmi_context_t *context) -{ - struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; - struct gen1_ptl_shared *recvshc = ptl->recvshc; - struct ips_recvhdrq_callbacks recvq_callbacks; - psm2_error_t err = PSM2_OK; - int i; - - /* Initialize (shared) hardware context recvq (ptl->recvq) */ - /* NOTE: uses recvq in ptl structure for shared h/w context */ - recvq_callbacks.callback_packet_unknown = psm3_gen1_ips_ptl_process_unknown; - recvq_callbacks.callback_subcontext = psm3_gen1_ips_subcontext_process; - recvq_callbacks.callback_error = psm3_gen1_ips_ptl_process_packet_error; - if ((err = psm3_gen1_recvhdrq_init(context, &ptl->epstate, &ptl->proto, - &recvq_callbacks, - recvshc->subcontext, - &ptl->recvq, - &recvshc->hwcontext_ctrl->recvq_state, - PSM3_GEN1_CL_Q_RX_HDR_Q))) { - goto fail; - } - - /* Initialize software subcontext (recvshc->recvq). Subcontexts do */ - /* not require the rcvhdr copy feature. 
*/ - recvq_callbacks.callback_subcontext = psm3_gen1_ips_subcontext_ignore; - if ((err = psm3_gen1_recvhdrq_init(context, &ptl->epstate, &ptl->proto, - &recvq_callbacks, - recvshc->subcontext, - &recvshc->recvq, &recvshc->recvq_state, - PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(recvshc->subcontext)))) { - goto fail; - } - - /* Initialize each recvshc->writeq for shared contexts */ - for (i = 0; i < recvshc->subcontext_cnt; i++) { - if ((err = psm3_gen1_ips_ptl_writehdrq_init(context, - &recvshc->writeq[i], - &recvshc->subcontext_ureg[i]-> - writeq_state, - i))) { - goto fail; - } - } - - if (err == PSM2_OK) - _HFI_DBG - ("Context sharing in use: %s, context %d, sub-context %d\n", - psm3_epid_fmt_addr(ptl->epid, 0), recvshc->context, - recvshc->subcontext); -fail: - return err; -} - -static psm2_error_t psm3_gen1_shrecvq_fini(ptl_t *ptl_gen) -{ - struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; - psm2_error_t err = PSM2_OK; - int i; - - /* disable my write header queue before deallocation */ - i = ptl->recvshc->subcontext; - ptl->recvshc->subcontext_ureg[i]->writeq_state.enabled = 0; - psmi_free(ptl->recvshc); - return err; -} - - -#ifdef PSM2_MOCK_TESTING -void psm3_gen1_ips_ptl_non_dw_mul_sdma_init(void) -{ - uint16_t major_version = psm3_gen1_get_user_major_version(); - uint16_t minor_version = psm3_gen1_get_user_minor_version(); - int allow_non_dw_mul = 0; - - if ((major_version > HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) || - ((major_version == HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) && - (minor_version >= HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED))) - { - allow_non_dw_mul = 1; - } - psm3_hal_current_hal_instance->params.cap_mask = 0; - if (allow_non_dw_mul) - psm3_hal_current_hal_instance->params.cap_mask |= PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE; -} -#endif /* PSM2_MOCK_TESTING */ - -/* linux doesn't have strlcat; this is a stripped down implementation */ -/* not super-efficient, but we use it rarely, and only for short strings */ -/* not fully standards conforming! 
*/ -static size_t strlcat(char *d, const char *s, size_t l) -{ - int dlen = strlen(d), slen, max; - if (l <= dlen) /* bug */ - return l; - slen = strlen(s); - max = l - (dlen + 1); - if (slen > max) - slen = max; - memcpy(d + dlen, s, slen); - d[dlen + slen] = '\0'; - return dlen + slen + 1; /* standard says to return full length, not actual */ -} - -void psm3_gen1_ips_ptl_dump_err_stats(struct ips_proto *proto) -{ - char err_stat_msg[2048]; - char tmp_buf[128]; - int len = sizeof(err_stat_msg); - - if (!(psm3_dbgmask & __HFI_PKTDBG)) - return; - - *err_stat_msg = '\0'; - - if (proto->error_stats.num_icrc_err || - proto->error_stats.num_ecc_err || - proto->error_stats.num_len_err || - proto->error_stats.num_tid_err || - proto->error_stats.num_dc_err || - proto->error_stats.num_dcunc_err || - proto->error_stats.num_khdrlen_err) { - - snprintf(tmp_buf, sizeof(tmp_buf), "ERROR STATS: "); - - if (proto->error_stats.num_icrc_err) { - snprintf(tmp_buf, sizeof(tmp_buf), "ICRC: %" PRIu64 " ", - proto->error_stats.num_icrc_err); - strlcat(err_stat_msg, tmp_buf, len); - } - - if (proto->error_stats.num_ecc_err) { - snprintf(tmp_buf, sizeof(tmp_buf), "ECC: %" PRIu64 " ", - proto->error_stats.num_ecc_err); - strlcat(err_stat_msg, tmp_buf, len); - } - - if (proto->error_stats.num_len_err) { - snprintf(tmp_buf, sizeof(tmp_buf), "LEN: %" PRIu64 " ", - proto->error_stats.num_len_err); - strlcat(err_stat_msg, tmp_buf, len); - } - - if (proto->error_stats.num_tid_err) { - snprintf(tmp_buf, sizeof(tmp_buf), "TID: %" PRIu64 " ", - proto->error_stats.num_tid_err); - strlcat(err_stat_msg, tmp_buf, len); - } - - if (proto->error_stats.num_dc_err) { - snprintf(tmp_buf, sizeof(tmp_buf), "DC: %" PRIu64 " ", - proto->error_stats.num_dc_err); - strlcat(err_stat_msg, tmp_buf, len); - } - - if (proto->error_stats.num_dcunc_err) { - snprintf(tmp_buf, sizeof(tmp_buf), - "DCUNC: %" PRIu64 " ", - proto->error_stats.num_dcunc_err); - strlcat(err_stat_msg, tmp_buf, len); - } - - if (proto->error_stats.num_khdrlen_err) { - snprintf(tmp_buf, sizeof(tmp_buf), - "KHDRLEN: %" PRIu64 " ", - proto->error_stats.num_khdrlen_err); - strlcat(err_stat_msg, tmp_buf, len); - } - strlcat(err_stat_msg, "\n", len); - } else - strlcat(err_stat_msg, "No previous errors.\n", len); - - _HFI_ERROR("%s", err_stat_msg); -} - -int -psm3_gen1_ips_ptl_process_err_chk_gen(struct ips_recvhdrq_event *rcv_ev) -{ - struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq; - struct ips_message_header *p_hdr = rcv_ev->p_hdr; - struct ips_protoexp *protoexp = recvq->proto->protoexp; - struct ips_tid_recv_desc *tidrecvc; - psmi_seqnum_t err_seqnum, recvseq; - ptl_arg_t desc_id = p_hdr->data[0]; - ptl_arg_t send_desc_id = p_hdr->data[1]; - int16_t seq_off; - uint8_t ack_type; - ips_scb_t ctrlscb; - - INC_TIME_SPEND(TIME_SPEND_USER4); - PSM2_LOG_MSG("entering"); - recvq->proto->epaddr_stats.err_chk_recv++; - - /* Ignore FECN bit since this is the control path */ - rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; - - /* Get the flowgenseq for err chk gen */ - err_seqnum.psn_val = __be32_to_cpu(p_hdr->bth[2]); - - /* Get receive descriptor */ - psmi_assert(desc_id._desc_idx < HFI_TF_NFLOWS); - tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; - - if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) { - /* Receive descriptor mismatch in time and space. 
- * Stale err chk gen, drop packet - */ - _HFI_DBG - ("ERR_CHK_GEN: gen mismatch Pkt: 0x%x, Current: 0x%x\n", - desc_id._desc_genc, tidrecvc->rdescid._desc_genc); - PSM2_LOG_MSG("leaving"); - return IPS_RECVHDRQ_CONTINUE; - } - psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY); - - /* - * We change tidrecvc->tidflow_genseq here only when a new generation - * is allocated and programmed into hardware. Otherwise we use local - * variable recvseq to create the reply. - */ - recvseq = tidrecvc->tidflow_genseq; - - /* Get the latest seq from hardware tidflow table. But - * only do this when context sharing is not used, because - * context sharing might drop packet even though hardware - * has received it successfully. - */ - if (!tidrecvc->context->tf_ctrl) - { - uint64_t tf; - uint32_t seqno=0; - - psmi_hal_tidflow_get(tidrecvc->rdescid._desc_idx, &tf, - tidrecvc->context->psm_hw_ctxt); - psmi_hal_tidflow_get_seqnum(tf, &seqno); - recvseq.psn_seq = seqno; - } - - if (err_seqnum.psn_gen != recvseq.psn_gen) { - ack_type = OPCODE_NAK; - /* NAK without allocating a new generation */ - - /* My current generation and last received seq */ - ctrlscb.ips_lrh.data[1].u32w0 = recvseq.psn_val; - } else { - /* Either lost packets or lost ack, we need to deal - * with wrap around of the seq value from 2047 to 0 - * because seq is only 11 bits */ - seq_off = (int16_t)(err_seqnum.psn_seq - recvseq.psn_seq); - if (seq_off < 0) - seq_off += 2048; /* seq is 11 bits */ - - if (seq_off < 1024) { - ack_type = OPCODE_NAK; - /* NAK with allocating a new generation */ - - /* set latest seq */ - tidrecvc->tidflow_genseq.psn_seq = recvseq.psn_seq; - /* allocate and set a new generation */ - ips_protoexp_flow_newgen(tidrecvc); - /* get the new generation */ - recvseq.psn_gen = tidrecvc->tidflow_genseq.psn_gen; - - /* My new generation and last received seq */ - ctrlscb.ips_lrh.data[1].u32w0 = recvseq.psn_val; - } else - /* ACK with last received seq, - * no need to set ips_lrh.data[1].u32w0 */ - ack_type = OPCODE_ACK; - } - - ctrlscb.scb_flags = 0; - ctrlscb.ips_lrh.data[0].u64 = send_desc_id.u64; - /* Keep peer generation but use my last received sequence */ - err_seqnum.psn_seq = recvseq.psn_seq; - ctrlscb.ips_lrh.ack_seq_num = err_seqnum.psn_val; - - /* May want to generate a BECN if a lot of swapped generations */ - if_pf((tidrecvc->tidflow_nswap_gen > 4) && - (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) { - _HFI_CCADBG - ("ERR_CHK_GEN: Generating BECN. 
Number of swapped generations: %d.\n", - tidrecvc->tidflow_nswap_gen); - /* Mark flow to generate BECN in control packet */ - tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN; - - /* Update stats for congestion encountered */ - recvq->proto->epaddr_stats.congestion_pkts++; - } - - // no payload, pass cksum so non-NULL - psm3_ips_proto_send_ctrl_message(&tidrecvc->tidflow, - ack_type, &tidrecvc->ctrl_msg_queued, - &ctrlscb, ctrlscb.cksum, 0); - - /* Update stats for expected window */ - tidrecvc->stats.nErrChkReceived++; - if (ack_type == OPCODE_NAK) - tidrecvc->stats.nReXmit++; /* Update stats for retransmit (Sent a NAK) */ - - PSM2_LOG_MSG("leaving"); - return IPS_RECVHDRQ_CONTINUE; -} - -int -psm3_gen1_ips_ptl_process_becn(struct ips_recvhdrq_event *rcv_ev) -{ - struct ips_proto *proto = rcv_ev->proto; - struct ips_message_header *p_hdr = rcv_ev->p_hdr; - ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; - int flowid = ips_proto_flowid(p_hdr); - struct ips_flow *flow; - - psmi_assert(flowid < EP_FLOW_LAST); - flow = &ipsaddr->flows[flowid]; - if ((flow->path->opa.pr_ccti + - proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) { - ips_cca_adjust_rate(flow->path, - proto->cace[flow->path->pr_sl].ccti_increase); - /* Clear congestion event */ - rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; - } - - return IPS_RECVHDRQ_CONTINUE; -} - -int psm3_gen1_ips_ptl_process_unknown(const struct ips_recvhdrq_event *rcv_ev) -{ - int opcode; - struct ips_proto *proto = rcv_ev->proto; - psm2_ep_t ep_err; - char *pkt_type; - - if (0 == psm3_ips_proto_process_unknown(rcv_ev, &opcode)) - return IPS_RECVHDRQ_CONTINUE; - - // truely an unknown remote node, psm3_ips_proto_process_unknown already - // did generic output and debug packet dumps - // now output the final HAL specific error message - psm3_gen1_ips_ptl_dump_err_stats(proto); - - /* Other messages are definitely crosstalk. */ - /* out-of-context expected messages are always fatal */ - if (psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) == PSM3_GEN1_RHF_RX_TYPE_EXPECTED) { - ep_err = PSMI_EP_NORETURN; - pkt_type = "expected"; - } else if (psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) == PSM3_GEN1_RHF_RX_TYPE_EAGER) { - ep_err = PSMI_EP_LOGEVENT; - pkt_type = "eager"; - } else { - ep_err = PSMI_EP_NORETURN; - pkt_type = "unknown"; - } - - /* At this point we are out of luck. 
*/ - psm3_handle_error(ep_err, PSM2_EPID_NETWORK_ERROR, - "Received %s message(s) ptype=0x%x opcode=%x" - " from an unknown process", pkt_type, psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf), opcode); - - /* Always skip this packet unless the above call was a noreturn call */ - return IPS_RECVHDRQ_CONTINUE; -} - -/* decode RHF errors; only used one place now, may want more later */ -static void get_rhf_errstring(uint32_t err, char *msg, size_t len) -{ - *msg = '\0'; /* if no errors, and so don't need to check what's first */ - - if (err & PSM3_GEN1_RHF_ERR_ICRC) - strlcat(msg, "icrcerr ", len); - if (err & PSM3_GEN1_RHF_ERR_ECC) - strlcat(msg, "eccerr ", len); - if (err & PSM3_GEN1_RHF_ERR_LEN) - strlcat(msg, "lenerr ", len); - if (err & PSM3_GEN1_RHF_ERR_TID) - strlcat(msg, "tiderr ", len); - if (err & PSM3_GEN1_RHF_ERR_DC) - strlcat(msg, "dcerr ", len); - if (err & PSM3_GEN1_RHF_ERR_DCUN) - strlcat(msg, "dcuncerr ", len); - if (err & PSM3_GEN1_RHF_ERR_KHDRLEN) - strlcat(msg, "khdrlenerr ", len); -} - -/* get the error string as a number and a string */ -static void rhf_errnum_string(char *msg, size_t msglen, long err) -{ - int len; - char *errmsg; - - len = snprintf(msg, msglen, "RHFerror %lx: ", err); - if (len > 0 && len < msglen) { - errmsg = msg + len; - msglen -= len; - } else - errmsg = msg; - *errmsg = 0; - get_rhf_errstring(err, errmsg, msglen); -} - -static void -psm3_gen1_ptl_ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev) -{ - struct ips_tid_recv_desc *tidrecvc; - struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; - struct ips_message_header *p_hdr = rcv_ev->p_hdr; - - ptl_arg_t desc_id; - int tidpair = (__le32_to_cpu(p_hdr->khdr.kdeth0) >> - HFI_KHDR_TID_SHIFT) & HFI_KHDR_TID_MASK; - int tidctrl = (__le32_to_cpu(p_hdr->khdr.kdeth0) >> - HFI_KHDR_TIDCTRL_SHIFT) & HFI_KHDR_TIDCTRL_MASK; - int tid0, tid1, tid; - - psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); - - /* Expected sends not enabled */ - if (protoexp == NULL) - return; - - /* Not doing extra tid debugging or not really a tiderr */ - if (!(protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) || - !(psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf) & PSM3_GEN1_RHF_ERR_TID)) - return; - - if (psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) != PSM3_GEN1_RHF_RX_TYPE_EXPECTED) { - _HFI_ERROR("receive type %d is not " - "expected in tid debugging\n", psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf)); - return; - } - - desc_id._desc_idx = ips_proto_flowid(p_hdr); - desc_id._desc_genc = p_hdr->exp_rdescid_genc; - - tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; - - if (tidctrl != 3) - tid0 = tid1 = tidpair * 2 + tidctrl - 1; - else { - tid0 = tidpair * 2; - tid1 = tid0 + 1; - } - - for (tid = tid0; tid <= tid1; tid++) { - if (protoexp->tid_info[tid].state == TIDSTATE_USED) - continue; - - char buf[128]; - char *s = "invalid (not even in table)"; - - if (tidrecvc->rdescid._desc_genc == - desc_id._desc_genc) - s = "valid"; - else { - snprintf(buf, sizeof(buf) - 1, - "wrong generation (gen=%d,received=%d)", - tidrecvc->rdescid._desc_genc, - desc_id._desc_genc); - buf[sizeof(buf) - 1] = '\0'; - s = buf; - } - - if (protoexp->tid_info[tid].tidrecvc != tidrecvc) { - _HFI_ERROR - ("tid %d not a known member of tidsess %d\n", - tid, desc_id._desc_idx); - } - - _HFI_ERROR("tid %d is marked unused (session=%d): %s\n", tid, - desc_id._desc_idx, s); - } - return; -} - -static void -psm3_gen1_ptl_ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev) -{ - struct ips_tid_recv_desc *tidrecvc; - 
struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; - struct ips_message_header *p_hdr = rcv_ev->p_hdr; - int hdr_err = psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf) & PSM3_GEN1_RHF_ERR_KHDRLEN; - uint8_t op_code = _get_proto_hfi_opcode(p_hdr); - char pktmsg[128]; - char errmsg[256]; - - psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); - - /* Expected sends not enabled */ - if (protoexp == NULL) - return; - - get_rhf_errstring(psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf), pktmsg, - sizeof(pktmsg)); - - snprintf(errmsg, sizeof(errmsg), - "%s pkt type opcode 0x%x at hd=0x%x %s\n", - (psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) == PSM3_GEN1_RHF_RX_TYPE_EAGER) ? "Eager" : - (psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) == PSM3_GEN1_RHF_RX_TYPE_EXPECTED) ? "Expected" : - (psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) == PSM3_GEN1_RHF_RX_TYPE_NON_KD) ? "Non-kd" : - "", op_code, rcv_ev->recvq->state->hdrq_head, - pktmsg); - - if (!hdr_err) { - ptl_arg_t desc_id; - psmi_seqnum_t sequence_num; - - desc_id._desc_idx = ips_proto_flowid(p_hdr); - desc_id._desc_genc = p_hdr->exp_rdescid_genc; - - tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; - - if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) { - /* Print this at very verbose level. Noisy links can have a few of - * these! */ - _HFI_VDBG - ("Data Error Pkt and Recv Generation Mismatch: %s", - errmsg); - return; /* skip */ - } - - if (tidrecvc->state == TIDRECVC_STATE_FREE) { - _HFI_EPDBG - ("Data Error Pkt for a Completed Rendezvous: %s", - errmsg); - return; /* skip */ - } - - /* See if CRC error for a previous packet */ - sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); - if (sequence_num.psn_gen == tidrecvc->tidflow_genseq.psn_gen) { - /* Try to recover the flow by restarting from previous known good - * sequence (possible if the packet with CRC error is after the "known - * good PSN" else we can't restart the flow. - */ - return ips_protoexp_do_tf_seqerr(protoexp, - tidrecvc, p_hdr); - } else { - /* Print this at very verbose level */ - _HFI_VDBG - ("Data Error Packet. GenMismatch: Yes. Tidrecvc: %p. " - "Pkt Gen.Seq: %d.%d, TF Gen.Seq: %d.%d. %s\n", - tidrecvc, sequence_num.psn_gen, - sequence_num.psn_seq, - tidrecvc->tidflow_genseq.psn_gen, - tidrecvc->tidflow_genseq.psn_seq, errmsg); - } - - } else { - _HFI_VDBG("HDR_ERROR: %s\n", errmsg); - } - -} - -static void -psm3_gen1_ptl_ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev) -{ - struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; - struct ips_message_header *p_hdr = rcv_ev->p_hdr; - struct ips_tid_recv_desc *tidrecvc; - ptl_arg_t desc_id; - - psmi_assert_always(protoexp != NULL); - psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); - - desc_id._desc_idx = ips_proto_flowid(p_hdr); - desc_id._desc_genc = p_hdr->exp_rdescid_genc; - - tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; - - if (tidrecvc->rdescid._desc_genc == desc_id._desc_genc - && tidrecvc->state == TIDRECVC_STATE_BUSY) - ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr); - - return; -} - -static void -psm3_gen1_ptl_ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev) -{ - struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; - struct ips_message_header *p_hdr = rcv_ev->p_hdr; - struct ips_tid_recv_desc *tidrecvc; - ptl_arg_t desc_id; - - psmi_assert_always(protoexp != NULL); - psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); - - /* For a generation error our NAK crossed on the wire or this is a stale - * packet. 
Error recovery should sync things up again. Just drop this - * packet. - */ - desc_id._desc_idx = ips_proto_flowid(p_hdr); - desc_id._desc_genc = p_hdr->exp_rdescid_genc; - - tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; - - if (tidrecvc->rdescid._desc_genc == desc_id._desc_genc - && tidrecvc->state == TIDRECVC_STATE_BUSY) - ips_protoexp_do_tf_generr(protoexp, tidrecvc, p_hdr); - - return; -} - -/* - * Error handling - */ -int psm3_gen1_ips_ptl_process_packet_error(struct ips_recvhdrq_event *rcv_ev) -{ - struct ips_proto *proto = rcv_ev->proto; - int pkt_verbose_err = psm3_dbgmask & __HFI_PKTDBG; - int tiderr = psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf) & PSM3_GEN1_RHF_ERR_TID; - int tf_seqerr = psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf) & PSM3_GEN1_RHF_ERR_TFSEQ; - int tf_generr = psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf) & PSM3_GEN1_RHF_ERR_TFGEN; - int data_err = psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf) & - (PSM3_GEN1_RHF_ERR_ICRC | PSM3_GEN1_RHF_ERR_ECC | PSM3_GEN1_RHF_ERR_LEN | - PSM3_GEN1_RHF_ERR_DC | PSM3_GEN1_RHF_ERR_DCUN | PSM3_GEN1_RHF_ERR_KHDRLEN); - char pktmsg[128]; - - *pktmsg = 0; - /* - * Tid errors on eager pkts mean we get a headerq overflow, perfectly - * safe. Tid errors on expected or other packets means trouble. - */ - if (tiderr && psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) == PSM3_GEN1_RHF_RX_TYPE_EAGER) { - struct ips_message_header *p_hdr = rcv_ev->p_hdr; - - /* Payload dropped - Determine flow for this header and see if - * we need to generate a NAK. - * - * ALL PACKET DROPS IN THIS CATEGORY CAN BE FLAGGED AS DROPPED DUE TO - * CONGESTION AS THE EAGER BUFFER IS FULL. - * - * Possible eager packet type: - * - * Ctrl Message - ignore - * MQ message - Can get flow and see if we need to NAK. - * AM message - Can get flow and see if we need to NAK. - */ - - proto->stats.hdr_overflow++; - if (data_err) - return 0; - - switch (_get_proto_hfi_opcode(p_hdr)) { - case OPCODE_TINY: - case OPCODE_SHORT: - case OPCODE_EAGER: - case OPCODE_LONG_RTS: - case OPCODE_LONG_CTS: - case OPCODE_LONG_DATA: - case OPCODE_AM_REQUEST: - case OPCODE_AM_REQUEST_NOREPLY: - case OPCODE_AM_REPLY: - { - ips_epaddr_flow_t flowid = - ips_proto_flowid(p_hdr); - struct ips_epstate_entry *epstaddr; - struct ips_flow *flow; - psmi_seqnum_t sequence_num; - int16_t diff; - - /* Obtain ipsaddr for packet */ - epstaddr = - ips_epstate_lookup(rcv_ev->recvq->epstate, - rcv_ev->p_hdr->connidx); - if_pf(epstaddr == NULL - || epstaddr->ipsaddr == NULL) - return 0; /* Unknown packet - drop */ - - rcv_ev->ipsaddr = epstaddr->ipsaddr; - - psmi_assert(flowid < EP_FLOW_LAST); - flow = &rcv_ev->ipsaddr->flows[flowid]; - sequence_num.psn_val = - __be32_to_cpu(p_hdr->bth[2]); - diff = - (int16_t) (sequence_num.psn_num - - flow->recv_seq_num.psn_num); - - if (diff >= 0 - && !(flow-> - flags & IPS_FLOW_FLAG_NAK_SEND)) { - /* Mark flow as congested and attempt to generate NAK */ - flow->flags |= IPS_FLOW_FLAG_GEN_BECN; - proto->epaddr_stats.congestion_pkts++; - - flow->flags |= IPS_FLOW_FLAG_NAK_SEND; - flow->cca_ooo_pkts = 0; - ips_proto_send_nak((struct ips_recvhdrq - *)rcv_ev->recvq, - flow); - } - - /* Safe to process ACKs from header */ - psm3_ips_proto_process_ack(rcv_ev); - } - break; - case OPCODE_EXPTID: - /* If RSM is matching packets that are TID&FECN&SH, - * it is possible to have a EXPTID packet encounter - * the eager full condition and have the payload - * dropped (but the header delivered). 
- * Treat this condition as a data error (corruption,etc) - * and send a NAK. - */ - if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP)) - psm3_gen1_ptl_ips_protoexp_handle_data_err(rcv_ev); - break; - default: - break; - } - } else if (tf_generr) /* handle generr, ignore tiderr if any */ - psm3_gen1_ptl_ips_protoexp_handle_tf_generr(rcv_ev); - else if (tf_seqerr) - psm3_gen1_ptl_ips_protoexp_handle_tf_seqerr(rcv_ev); - else if (tiderr) { /* tid error, but not on an eager pkt */ - psm2_ep_t ep_err = PSMI_EP_LOGEVENT; - uint16_t tid, offset; - uint64_t t_now = get_cycles(); - - proto->tiderr_cnt++; - - /* Whether and how we will be logging this event */ - if (proto->tiderr_max > 0 - && proto->tiderr_cnt >= proto->tiderr_max) - ep_err = PSMI_EP_NORETURN; - else if (proto->tiderr_warn_interval != UINT64_MAX && - proto->tiderr_tnext <= t_now) - proto->tiderr_tnext = - get_cycles() + proto->tiderr_warn_interval; - else - ep_err = NULL; - - if (ep_err != NULL) { - rhf_errnum_string(pktmsg, sizeof(pktmsg), - psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf)); - - tid = (__le32_to_cpu(rcv_ev->p_hdr->khdr.kdeth0) >> - HFI_KHDR_TID_SHIFT) & HFI_KHDR_TID_MASK; - offset = __le32_to_cpu(rcv_ev->p_hdr->khdr.kdeth0) & - HFI_KHDR_OFFSET_MASK; - - psm3_handle_error(ep_err, PSM2_EP_DEVICE_FAILURE, - "%s with tid=%d,offset=%d,count=%d: %s %s", - "TID Error", - tid, offset, proto->tiderr_cnt, - pktmsg, ep_err == PSMI_EP_NORETURN ? - "(Terminating...)" : ""); - } - - psm3_gen1_ptl_ips_protoexp_handle_tiderr(rcv_ev); - } else if (data_err) { -#if _HFI_DEBUGGING - if (_HFI_DBG_ON) { - uint8_t op_code - = _get_proto_hfi_opcode(rcv_ev->p_hdr); - - if (!pkt_verbose_err) { - rhf_errnum_string(pktmsg, sizeof(pktmsg), - psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf)); - _HFI_DBG_ALWAYS - ("Error %s pkt type opcode 0x%x at hd=0x%x %s\n", - (psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) == - PSM3_GEN1_RHF_RX_TYPE_EAGER) ? "eager" : ( - psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) == - PSM3_GEN1_RHF_RX_TYPE_EXPECTED) - ? "expected" : (psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) == - PSM3_GEN1_RHF_RX_TYPE_NON_KD) ? "non-kd" : - "", op_code, - rcv_ev->recvq->state->hdrq_head, pktmsg); - } - } -#endif - - if (psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf) == PSM3_GEN1_RHF_RX_TYPE_EXPECTED) - psm3_gen1_ptl_ips_protoexp_handle_data_err(rcv_ev); - } else { /* not a tid or data error -- some other error */ -#if _HFI_DEBUGGING - if (_HFI_DBG_ON) { - uint8_t op_code = - __be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 24 & 0xFF; - - if (!pkt_verbose_err) - rhf_errnum_string(pktmsg, sizeof(pktmsg), - psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf)); - - /* else RHFerr decode printed below */ - _HFI_DBG_ALWAYS - ("Error pkt type 0x%x opcode 0x%x at hd=0x%x %s\n", - psm3_gen1_rhf_get_rx_type(rcv_ev->gen1_rhf), op_code, - rcv_ev->recvq->state->hdrq_head, pktmsg); - } -#endif - } - if (pkt_verbose_err) { - if (!*pktmsg) - rhf_errnum_string(pktmsg, sizeof(pktmsg), - psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf)); - psm3_ips_proto_show_header(rcv_ev->p_hdr, pktmsg); - } - - return 0; -} - -static void psm3_gen1_gen_ipd_table(struct ips_proto *proto) -{ - uint8_t delay = 0, step = 1; - /* Based on our current link rate setup the IPD table */ - memset(proto->ips_ipd_delay, 0xFF, sizeof(proto->ips_ipd_delay)); - - /* - * Based on the starting rate of the link, we let the code to - * fall through to next rate without 'break' in the code. The - * decrement is doubled at each rate level... 
- */ - switch (proto->epinfo.ep_link_rate) { - case PSM3_IBV_RATE_300_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_100_GBPS] = delay; - delay += step; - step *= 2; - case PSM3_IBV_RATE_200_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_100_GBPS] = delay; - delay += step; - step *= 2; - case PSM3_IBV_RATE_168_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_100_GBPS] = delay; - delay += step; - step *= 2; - case PSM3_IBV_RATE_120_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_100_GBPS] = delay; - case PSM3_IBV_RATE_112_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_100_GBPS] = delay; - case PSM3_IBV_RATE_100_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_100_GBPS] = delay; - delay += step; - step *= 2; - case PSM3_IBV_RATE_80_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_80_GBPS] = delay; - case PSM3_IBV_RATE_60_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_60_GBPS] = delay; - delay += step; - step *= 2; - case PSM3_IBV_RATE_40_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_40_GBPS] = delay; - case PSM3_IBV_RATE_30_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_30_GBPS] = delay; - delay += step; - step *= 2; - case PSM3_IBV_RATE_25_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_25_GBPS] = delay; - case PSM3_IBV_RATE_20_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_20_GBPS] = delay; - delay += step; - step *= 2; - case PSM3_IBV_RATE_10_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_10_GBPS] = delay; - case PSM3_IBV_RATE_5_GBPS: - proto->ips_ipd_delay[PSM3_IBV_RATE_5_GBPS] = delay; - default: - break; - } -} - -static psm2_error_t psm3_gen1_gen_cct_table(struct ips_proto *proto) -{ - psm2_error_t err = PSM2_OK; - uint32_t cca_divisor, ipdidx, ipdval = 1; - uint16_t *cct_table; - - /* The CCT table is static currently. If it's already created then return */ - if (proto->cct) - goto fail; - - /* Allocate the CCT table */ - cct_table = psmi_calloc(proto->ep, UNDEFINED, - proto->ccti_size, sizeof(uint16_t)); - if (!cct_table) { - err = PSM2_NO_MEMORY; - goto fail; - } - - if (proto->ccti_size) - { - /* The first table entry is always 0 i.e. no IPD delay */ - cct_table[0] = 0; - } - - /* Generate the remaining CCT table entries */ - for (ipdidx = 1; ipdidx < proto->ccti_size; ipdidx += 4, ipdval++) - for (cca_divisor = 0; cca_divisor < 4; cca_divisor++) { - if ((ipdidx + cca_divisor) == proto->ccti_size) - break; - cct_table[ipdidx + cca_divisor] = - (((cca_divisor ^ 0x3) << CCA_DIVISOR_SHIFT) | - (ipdval & 0x3FFF)); - _HFI_CCADBG("CCT[%d] = %x. Divisor: %x, IPD: %x\n", - ipdidx + cca_divisor, - cct_table[ipdidx + cca_divisor], - (cct_table[ipdidx + cca_divisor] >> - CCA_DIVISOR_SHIFT), - cct_table[ipdidx + - cca_divisor] & CCA_IPD_MASK); - } - - /* On link up/down CCT is re-generated. If CCT table is previously created - * free it - */ - if (proto->cct) { - psmi_free(proto->cct); - proto->cct = NULL; - } - - /* Update to the new CCT table */ - proto->cct = cct_table; - -fail: - return err; -} - -// Fetch current link state to update linkinfo fields in ips_proto: -// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables -// These are all fields which can change during a link bounce. -// Note "active" state is not adjusted as on link down PSM will wait for -// the link to become usable again so it's always a viable/active device -// afer initial PSM startup has selected devices. -// Called during initialization of ips_proto during ibta_init as well -// as during a link bounce. 
-// TBD - may be able to call this from HAL ips_proto_init as well as -// directly within HAL event processing, in which case this could -// be completely internal to HAL and not exposed in HAL API -psm2_error_t psm3_gen1_ptl_ips_update_linkinfo(struct ips_proto *proto) -{ - psm2_error_t err = PSM2_OK; - uint16_t lid; - int ret; - uint64_t link_speed; - - /* Get base lid, lmc and rate as these may have changed if the link bounced */ - // for Ethernet LID of 1 is returned - lid = psm3_epid_lid(proto->ep->context.epid); - proto->epinfo.ep_base_lid = __cpu_to_be16(lid); - - if ((ret = psm3_gen1_get_port_lmc(proto->ep->unit_id, - proto->ep->portnum)) < 0) { - err = psm3_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, - "Could not obtain LMC for unit %u:%u. Error: %s", - proto->ep->unit_id, proto->ep->portnum, strerror(errno)); - goto fail; - } - proto->epinfo.ep_lmc = min(ret, IPS_MAX_PATH_LMC); - - if (psm3_hfp_gen1_get_port_speed(proto->ep->unit_id, - proto->ep->portnum, &link_speed) < - 0) { - err = - psm3_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, - "Could obtain link rate for unit %u:%u. Error: %s", - proto->ep->unit_id, proto->ep->portnum, strerror(errno)); - goto fail; - } - proto->epinfo.ep_link_rate = ips_link_speed_to_enum(link_speed); - - /* Load the SL2SC2VL table */ - psm3_gen1_ips_ptl_init_sl2sc_table(proto); - - /* Regenerate new IPD table for the updated link rate. */ - psm3_gen1_gen_ipd_table(proto); - - /* Generate the CCT table. */ - err = psm3_gen1_gen_cct_table(proto); - -fail: - return err; -} - -#endif // PSM_OPA diff --git a/psm3/hal_gen1/gen1_ptl_ips_expected.c b/psm3/hal_gen1/gen1_ptl_ips_expected.c deleted file mode 100644 index a9e0e2a..0000000 --- a/psm3/hal_gen1/gen1_ptl_ips_expected.c +++ /dev/null @@ -1,89 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2021 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2021 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2021 Intel Corporation. All rights reserved. */ - -/* This file implements the HAL specific code for PSM PTL for ips RDMA */ -#include "psm_user.h" -#include "psm2_hal.h" -#include "ptl_ips.h" -#include "psm_mq_internal.h" -#include "gen1_hal.h" - -// The value returned is a bitmask of IPS_PROTOEXP_FLAG_* selections -// When reload==1, we refetch the env variable and reload the cached value -// While this can also be used to set additional flags (TID_DEBUG, -// RTS_CTS_INTERLEAVE and CTS_SERIALIZED), it should not. -// TID_DEBUG and CTS_SERIALIZED are automatically set when appropriate, -// and there is an env variable for RTS_CTS_INTERLEAVE. -unsigned psm3_gen1_parse_tid(int reload) -{ - union psmi_envvar_val envval; - static int have_value = 0; - static unsigned saved; - - // only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times - if (!reload && have_value) - return saved; - - psm3_getenv("PSM3_TID", - "Tid proto flags (0 disables protocol)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, - (union psmi_envvar_val)IPS_PROTOEXP_FLAG_TID, - &envval); - saved = envval.e_uint; - have_value = 1; - return saved; -} -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_ptl_ips_subcontext.h b/psm3/hal_gen1/gen1_ptl_ips_subcontext.h deleted file mode 100644 index d03b0a7..0000000 --- a/psm3/hal_gen1/gen1_ptl_ips_subcontext.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ - -#ifndef _GEN1_PTL_IPS_SUBCONTEXT_H -#define _GEN1_PTL_IPS_SUBCONTEXT_H - -#include -#include "ips_recvhdrq.h" -#include "gen1_ptl_ips_writehdrq.h" - -/* This data structure is allocated in ureg page of each subcontext process */ - -struct gen1_ips_subcontext_ureg { - /* head/eager head/tail register storage, one per cacheline - (member is unused by PSM, but needed here to match driver structures). */ - uint64_t subcontext_uregbase[40 /* i.e. ur_maxreg * 8 */]; - struct gen1_ips_writehdrq_state writeq_state; /* used in all ureg pages */ -} __attribute__ ((aligned(64))); - -struct gen1_ips_hwcontext_ctrl { - pthread_spinlock_t context_lock; /* lock shared by all subctxts */ - struct ips_recvhdrq_state recvq_state; /* state shared by all subctxts */ - uint32_t rx_hdrq_rhf_seq; /* rhf seq for the hw hdrq shared - by all subctxts */ -} __attribute__ ((aligned(64))); -#endif /* _GEN1_PTL_IPS_SUBCONTEXT_H */ - -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_ptl_ips_writehdrq.h b/psm3/hal_gen1/gen1_ptl_ips_writehdrq.h deleted file mode 100644 index 8bdb1fd..0000000 --- a/psm3/hal_gen1/gen1_ptl_ips_writehdrq.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ - -#ifndef _GEN1_PTL_IPS_WRITEHDRQ_H -#define _GEN1_PTL_IPS_WRITEHDRQ_H - -#include "psm_user.h" -#include "ips_recvq.h" - -/* - * Structure containing state for writehdrq writing. This is logically - * part of ips_writehdrq but needs to be separated out for context - * sharing so that it can be put in a shared memory page and hence - * be available to all processes sharing the port. Generally, do not - * put pointers in here since the address map of each process can be - * different. - */ -struct gen1_ips_writehdrq_state { - uint32_t hdrq_rhf_seq; /* last seq */ - uint32_t egrq_offset; /* in bytes unit, not 64B */ - uint32_t enabled; /* enables writing */ -}; - -struct gen1_ips_writehdrq { - const psmi_context_t *context; - struct gen1_ips_writehdrq_state *state; - uint32_t hdrq_elemlast; -}; - -#endif /* _GEN1_PTL_IPS_WRITEHDRQ_H */ -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_rcvthread.c b/psm3/hal_gen1/gen1_rcvthread.c deleted file mode 100644 index 495c2c3..0000000 --- a/psm3/hal_gen1/gen1_rcvthread.c +++ /dev/null @@ -1,193 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2021 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2021 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2021 Intel Corporation. All rights reserved. */ - -#include - -#include "psm_user.h" -#include "psm2_hal.h" -#include "psm_mq_internal.h" -#include "ptl_ips.h" -#include "ips_proto.h" -#include "gen1_hal.h" - -/* - * Receiver thread support. - * - * By default, polling in the driver asks the chip to generate an interrupt on - * every packet. When the driver supports POLLURG we can switch the poll mode - * to one that requests interrupts only for packets that contain an urgent bit - * (and optionally enable interrupts for hdrq overflow events). When poll - * returns an event, we *try* to make progress on the receive queue but simply - * go back to sleep if we notice that the main thread is already making - * progress. - * - * returns: - * PSM2_IS_FINALIZED - fd_pipe was closed, caller can exit rcvthread - * PSM2_NO_PROGRESS - got an EINTR, need to be called again with same - * next_timeout value - * PSM2_TIMEOUT - poll waited full timeout, no events - * PSM2_OK - poll found an event and processed it - * PSM2_INTERNAL_ERR - unexpected error attempting poll() - * updates counters: pollok (poll's which made progress), pollcyc (time spent - * polling without finding any events) - */ -psm2_error_t psm3_gen1_ips_ptl_pollintr(psm2_ep_t ep, - struct ips_recvhdrq *recvq, int fd_pipe, int next_timeout, - uint64_t *pollok, uint64_t *pollcyc) -{ - struct pollfd pfd[2]; - int ret; - uint64_t t_cyc; - psm2_error_t err; - - // pfd[0] is for urgent inbound packets (NAK, urgent ACK, etc) - // pfd[1] is for rcvthread termination - // on timeout (poll() returns 0), we do background process checks - // for non urgent inbound packets - pfd[0].fd = psm3_gen1_get_fd(ep->context.psm_hw_ctxt); - pfd[0].events = POLLIN; - pfd[0].revents = 0; - pfd[1].fd = fd_pipe; - pfd[1].events = POLLIN; - pfd[1].revents = 0; - ret = poll(pfd, 2, next_timeout); - t_cyc = get_cycles(); - if_pf(ret < 0) { - if (errno == EINTR) { - _HFI_DBG("got signal, keep polling\n"); - return PSM2_OK_NO_PROGRESS; - } else { - psm3_handle_error(PSMI_EP_NORETURN, - PSM2_INTERNAL_ERR, - "Receive thread poll() error: %s", - strerror(errno)); - return PSM2_INTERNAL_ERR; - } - } else if (pfd[1].revents) { - /* Any type of event on this fd means exit, should be POLLHUP */ - _HFI_DBG("close thread: revents=0x%x\n", pfd[1].revents); - close(fd_pipe); - return PSM2_IS_FINALIZED; - } else { - if (!PSMI_LOCK_TRY(psm3_creation_lock)) { - if (ret == 0 || pfd[0].revents & (POLLIN | POLLERR)) { - if (PSMI_LOCK_DISABLED) { - // this path is not supported. having rcvthread - // and PSMI_PLOCK_IS_NOLOCK define not allowed. - /* We do this check without acquiring the lock, no sense - * adding the overhead and it doesn't matter if we're - * wrong. 
*/ - if (psm3_gen1_recvhdrq_isempty(recvq)) - return PSM2_OK; - if(recvq->proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { - psm3_gen1_recvhdrq_scan_cca(recvq); - } - if (!ips_recvhdrq_trylock(recvq)) - return PSM2_OK; - err = psm3_gen1_recvhdrq_progress(recvq); - if (err == PSM2_OK) - (*pollok)++; - else - (*pollcyc) += get_cycles() - t_cyc; - ips_recvhdrq_unlock(recvq); - } else { - - ep = psm3_opened_endpoint; - - if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) { - if(recvq->proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN ) { - psm3_gen1_recvhdrq_scan_cca(recvq); - } - PSMI_UNLOCK(ep->mq->progress_lock); - } - /* Go through all master endpoints. */ - do{ - if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) { - /* If we time out, we service shm and NIC. - * If not, we assume to have received an urgent - * packet and service only NIC. - */ - err = psm3_poll_internal(ep, - ret == 0 ? PSMI_TRUE : PSMI_FALSE); -#ifdef PSM_HAVE_REG_MR -#ifdef UMR_CACHE - if (ep->mr_cache_mode == MR_CACHE_MODE_USER && !ep->verbs_ep.umrc.thread) - psm3_gen1_poll_uffd_events(ep); -#endif -#endif - if (err == PSM2_OK) - (*pollok)++; - else - (*pollcyc) += get_cycles() - t_cyc; - PSMI_UNLOCK(ep->mq->progress_lock); - } - - /* get next endpoint from multi endpoint list */ - ep = ep->user_ep_next; - } while(NULL != ep); - } - } - PSMI_UNLOCK(psm3_creation_lock); - } - if (ret == 0) - /* timed out poll */ - return PSM2_TIMEOUT; - else - /* found work to do */ - return PSM2_OK; - } -} -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_recvhdrq.c b/psm3/hal_gen1/gen1_recvhdrq.c deleted file mode 100644 index 86e9e8c..0000000 --- a/psm3/hal_gen1/gen1_recvhdrq.c +++ /dev/null @@ -1,755 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2021 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2021 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2021 Intel Corporation. All rights reserved. */ - -#include "psm_user.h" -#include "psm2_hal.h" - -#include "ips_epstate.h" -#include "ips_proto.h" -#include "ips_expected_proto.h" -#include "ips_proto_help.h" -#include "ips_proto_internal.h" -#include "gen1_hal.h" - -/* - * Receive header queue initialization. - */ -psm2_error_t -psm3_gen1_recvhdrq_init(const psmi_context_t *context, - const struct ips_epstate *epstate, - const struct ips_proto *proto, - const struct ips_recvhdrq_callbacks *callbacks, - uint32_t subcontext, - struct ips_recvhdrq *recvq - , struct ips_recvhdrq_state *recvq_state, - psm3_gen1_cl_q gen1_cl_hdrq - ) -{ - psm2_error_t err = PSM2_OK; - - memset(recvq, 0, sizeof(*recvq)); - recvq->proto = (struct ips_proto *)proto; - recvq->context = context; - recvq->subcontext = subcontext; - recvq->state = recvq_state; - recvq->gen1_cl_hdrq = gen1_cl_hdrq; - pthread_spin_init(&recvq->hdrq_lock, PTHREAD_PROCESS_SHARED); - recvq->hdrq_elemlast = ((psm3_gen1_get_rx_hdr_q_cnt(context->psm_hw_ctxt) - 1) * - (psm3_gen1_get_rx_hdr_q_ent_size(context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT)); - - recvq->epstate = epstate; - recvq->recvq_callbacks = *callbacks; /* deep copy */ - SLIST_INIT(&recvq->pending_acks); - - recvq->state->hdrq_head = 0; - recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE; - recvq->state->num_hdrq_done = 0; - recvq->state->num_egrq_done = 0; - recvq->state->hdr_countdown = 0; - recvq->state->hdrq_cachedlastscan = 0; - - { - union psmi_envvar_val env_hdr_update; - psm3_getenv("PSM3_HEAD_UPDATE", - "header queue update interval (0 to update after all entries are processed). Default is 64", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, - (union psmi_envvar_val) 64, &env_hdr_update); - - /* Cap max header update interval to size of header/eager queue */ - recvq->state->head_update_interval = - min(env_hdr_update.e_uint, psm3_gen1_get_rx_hdr_q_cnt(context->psm_hw_ctxt) - 1); - recvq->state->egrq_update_interval = 1; - } - return err; -} - - -/* flush the eager buffers, by setting the eager index head to eager index tail - if eager buffer queue is full. - - Called when we had eager buffer overflows (ERR_TID/HFI_RHF_H_TIDERR - was set in RHF errors), and no good eager packets were received, so - that eager head wasn't advanced. -*/ -#if 0 -static void psm3_gen1_flush_egrq_if_required(struct ips_recvhdrq *recvq) -{ - const uint32_t tail = ips_recvq_tail_get(&recvq->egrq); - const uint32_t head = ips_recvq_head_get(&recvq->egrq); - uint32_t egr_cnt = recvq->egrq.elemcnt; - - if ((head % egr_cnt) == ((tail + 1) % egr_cnt)) { - _HFI_DBG("eager array full after overflow, flushing " - "(head %llx, tail %llx)\n", - (long long)head, (long long)tail); - recvq->proto->stats.egr_overflow++; - } - return; -} -#endif - -/* - * Helpers for recvhdrq_progress. 
- */ - -static __inline__ int -_get_proto_subcontext(const struct ips_message_header *p_hdr) -{ - return ((__be32_to_cpu(p_hdr->bth[1]) >> - HFI_BTH_SUBCTXT_SHIFT) & HFI_BTH_SUBCTXT_MASK); -} - -static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev) -{ - uint8_t *payload = ips_recvhdrq_event_payload(rcv_ev); - uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + - ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); - -#ifdef PSM_DEBUG - psm3_ips_proto_show_header((struct ips_message_header *) - rcv_ev->p_hdr, "received invalid pkt"); -#endif - if (psm3_dbgmask & __HFI_PKTDBG) { - psm3_ips_proto_dump_frame(rcv_ev->p_hdr, HFI_MESSAGE_HDR_SIZE, - "header"); - if (!payload) { - _HFI_DBG("Cannot dump frame; payload is NULL\n"); - } else if (paylen) { - psm3_ips_proto_dump_frame(payload, paylen, "data"); - } - } - -} - -static __inline__ void -_update_error_stats(struct ips_proto *proto, uint32_t err) -{ - if (err & PSM3_GEN1_RHF_ERR_ICRC) - proto->error_stats.num_icrc_err++; - if (err & PSM3_GEN1_RHF_ERR_ECC) - proto->error_stats.num_ecc_err++; - if (err & PSM3_GEN1_RHF_ERR_LEN) - proto->error_stats.num_len_err++; - if (err & PSM3_GEN1_RHF_ERR_TID) - proto->error_stats.num_tid_err++; - if (err & PSM3_GEN1_RHF_ERR_DC) - proto->error_stats.num_dc_err++; - if (err & PSM3_GEN1_RHF_ERR_DCUN) - proto->error_stats.num_dcunc_err++; - if (err & PSM3_GEN1_RHF_ERR_KHDRLEN) - proto->error_stats.num_khdrlen_err++; -} - -#ifdef PSM_DEBUG - -static int _check_headers(struct ips_recvhdrq_event *rcv_ev, psm3_gen1_cl_q cl_q) -{ - struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq; - struct ips_proto *proto = rcv_ev->proto; - uint32_t *lrh = (uint32_t *) rcv_ev->p_hdr; - uint32_t dest_context; - const uint16_t pkt_dlid = __be16_to_cpu(rcv_ev->p_hdr->lrh[1]); - const uint16_t base_dlid = - __be16_to_cpu(recvq->proto->epinfo.ep_base_lid); - - /* Check that the receive header queue entry has a sane sequence number */ - if (psm3_gen1_check_rhf_sequence_number(psm3_gen1_rhf_get_seq(rcv_ev->gen1_rhf)) - != PSM_HAL_ERROR_OK) { - unsigned int seqno=0; - - psm3_gen1_get_rhf_expected_sequence_number(&seqno, cl_q, recvq->context->psm_hw_ctxt); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "ErrPkt: Invalid header queue entry! RHF Sequence in Hdrq Seq: %d, Recvq State Seq: %d. LRH[0]: 0x%08x, LRH[1] (PktCount): 0x%08x\n", - psm3_gen1_rhf_get_seq(rcv_ev->gen1_rhf), - seqno, lrh[0], lrh[1]); - return -1; - } - - /* Verify that the packet was destined for our context */ - dest_context = ips_proto_dest_context_from_header(proto, rcv_ev->p_hdr); - if_pf(dest_context != recvq->proto->epinfo.ep_context) { - - struct ips_recvhdrq_state *state = recvq->state; - - /* Packet not targeted at us. Drop packet and continue */ - psm3_gen1_ips_ptl_dump_err_stats(proto); - _dump_invalid_pkt(rcv_ev); - - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "ErrPkt: Received packet for context %d on context %d. Receive Header Queue offset: 0x%x. 
Exiting.\n", - dest_context, recvq->proto->epinfo.ep_context, - state->hdrq_head); - - return -1; - } - - /* Verify that rhf packet length matches the length in LRH */ - if_pf(psm3_gen1_rhf_get_packet_length(rcv_ev->gen1_rhf) != - ips_proto_lrh2_be_to_bytes(proto, rcv_ev->p_hdr->lrh[2])) { - _HFI_EPDBG - ("ErrPkt: RHF Packet Len (0x%x) does not match LRH (0x%x).\n", - psm3_gen1_rhf_get_packet_length(rcv_ev->gen1_rhf) >> 2, - __be16_to_cpu(rcv_ev->p_hdr->lrh[2])); - - psm3_gen1_ips_ptl_dump_err_stats(proto); - _dump_invalid_pkt(rcv_ev); - return -1; - } - - /* Verify that the DLID matches our local LID. */ - if_pf(!((base_dlid <= pkt_dlid) && - (pkt_dlid <= - (base_dlid + (1 << recvq->proto->epinfo.ep_lmc))))) { - _HFI_EPDBG - ("ErrPkt: DLID in LRH (0x%04x) does not match local LID (0x%04x) Skipping packet!\n", - rcv_ev->p_hdr->lrh[1], recvq->proto->epinfo.ep_base_lid); - psm3_gen1_ips_ptl_dump_err_stats(proto); - _dump_invalid_pkt(rcv_ev); - return -1; - } - - return 0; -} -#endif /* PSM_DEBUG */ - -static __inline__ int do_pkt_cksum(struct ips_recvhdrq_event *rcv_ev) -{ - uint8_t *payload = ips_recvhdrq_event_payload(rcv_ev); - uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + - ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); - uint32_t *ckptr; - uint32_t recv_cksum, cksum, dest_subcontext; - /* With checksum every packet has a payload */ - psmi_assert_always(payload); - - ckptr = (uint32_t *) (payload + paylen); - recv_cksum = ckptr[0]; - - cksum = psm3_ips_cksum_calculate(rcv_ev->p_hdr, payload, paylen); - - if ((cksum != recv_cksum) || (ckptr[0] != ckptr[1])) { - struct ips_epstate_entry *epstaddr; - uint32_t lcontext; - psm3_gen1_cl_idx hd, tl; - - epstaddr = - ips_epstate_lookup(rcv_ev->recvq->epstate, - rcv_ev->p_hdr->connidx); - epstaddr = (epstaddr && epstaddr->ipsaddr) ? epstaddr : NULL; - lcontext = epstaddr ? rcv_ev->proto->epinfo.ep_context : -1; - - hd = psm3_gen1_get_cl_q_head_index(PSM3_GEN1_CL_Q_RX_HDR_Q, - rcv_ev->recvq->context->psm_hw_ctxt); - tl = psm3_gen1_get_cl_q_tail_index(PSM3_GEN1_CL_Q_RX_HDR_Q, - rcv_ev->recvq->context->psm_hw_ctxt); - - dest_subcontext = _get_proto_subcontext(rcv_ev->p_hdr); - - _HFI_ERROR - ("ErrPkt: SharedContext: %s. Local Context: %i, Checksum mismatch from LID %d! Received Checksum: 0x%08x, Expected: 0x%08x & 0x%08x. Opcode: 0x%08x, Error Flag: 0x%08x. hdrq hd 0x%x tl 0x%x rhf 0x%" - PRIx64 ", rhfseq 0x%x\n", - (dest_subcontext != - rcv_ev->recvq->subcontext) ? "Yes" : "No", lcontext, - epstaddr ? 
__be16_to_cpu(epstaddr->ipsaddr->pathgrp-> - pg_base_dlid) : -1, cksum, - ckptr[0], ckptr[1], _get_proto_hfi_opcode(rcv_ev->p_hdr), - psm3_gen1_rhf_get_all_err_flags(rcv_ev->gen1_rhf), hd, tl, rcv_ev->gen1_rhf.raw_rhf, - psm3_gen1_rhf_get_seq(rcv_ev->gen1_rhf)); - /* Dump packet */ - _dump_invalid_pkt(rcv_ev); - return 0; /* Packet checksum error */ - } - - return 1; -} - -/* receive service routine for each packet opcode starting at - * OPCODE_RESERVED (C0) - */ -ips_packet_service_fn_t -psm3_gen1_packet_service_routines[] = { -psm3_ips_proto_process_unknown_opcode, /* 0xC0 */ -psm3_ips_proto_mq_handle_tiny, /* OPCODE_TINY */ -psm3_ips_proto_mq_handle_short, /* OPCODE_SHORT */ -psm3_ips_proto_mq_handle_eager, /* OPCODE_EAGER */ -psm3_ips_proto_mq_handle_rts, /* OPCODE_LONG_RTS */ -psm3_ips_proto_mq_handle_cts, /* OPCODE_LONG_CTS */ -psm3_ips_proto_mq_handle_data, /* OPCODE_LONG_DATA */ -ips_protoexp_data, /* OPCODE_EXPTID */ -ips_protoexp_recv_tid_completion, /* OPCODE_EXPTID_COMPLETION */ - -/* these are control packets */ -psm3_ips_proto_process_ack, /* OPCODE_ACK */ -psm3_ips_proto_process_nak, /* OPCODE_NAK */ -psm3_gen1_ips_ptl_process_becn, /* OPCODE_BECN */ -psm3_ips_proto_process_err_chk, /* OPCODE_ERR_CHK */ -psm3_gen1_ips_ptl_process_err_chk_gen, /* OPCODE_ERR_CHK_GEN */ -psm3_ips_proto_connect_disconnect, /* OPCODE_CONNECT_REQUEST */ -psm3_ips_proto_connect_disconnect, /* OPCODE_CONNECT_REPLY */ -psm3_ips_proto_connect_disconnect, /* OPCODE_DISCONNECT__REQUEST */ -psm3_ips_proto_connect_disconnect, /* OPCODE_DISCONNECT_REPLY */ - -/* rest are not control packets */ -psm3_ips_proto_am, /* OPCODE_AM_REQUEST_NOREPLY */ -psm3_ips_proto_am, /* OPCODE_AM_REQUEST */ -psm3_ips_proto_am /* OPCODE_AM_REPLY */ - -/* D5-DF (OPCODE_FUTURE_FROM to OPCODE_FUTURE_TO) reserved for expansion */ -}; - -/* - * Core receive progress function - * - * recvhdrq_progress is the core function that services the receive header - * queue and optionally, the eager queue. At the lowest level, it identifies - * packets marked with errors by the chip and also detects and corrects when - * eager overflow conditions occur. At the highest level, it queries the - * 'epstate' interface to classify packets from "known" and "unknown" - * endpoints. In order to support shared contexts, it can also handle packets - * destined for other contexts (or "subcontexts"). - */ -psm2_error_t psm3_gen1_recvhdrq_progress(struct ips_recvhdrq *recvq) -{ - GENERIC_PERF_BEGIN(PSM_RX_SPEEDPATH_CTR); /* perf stats */ - struct ips_recvhdrq_state *state = recvq->state; - PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto = - recvq->proto, - .recvq = recvq - }; - struct ips_epstate_entry *epstaddr; - uint32_t num_hdrq_done = 0; - const uint32_t num_hdrq_todo = psm3_gen1_get_rx_hdr_q_cnt(recvq->context->psm_hw_ctxt); - uint32_t dest_subcontext; - const uint32_t hdrq_elemsz = psm3_gen1_get_rx_hdr_q_ent_size(recvq->context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT; - int ret = IPS_RECVHDRQ_CONTINUE; - int done = 0, empty = 0; - int do_hdr_update = 0; - const psm3_gen1_cl_q gen1_hdr_q = recvq->gen1_cl_hdrq; - const psm3_gen1_cl_q psm_hal_egr_q = gen1_hdr_q + 1; - - /* Returns whether the currently set 'rcv_hdr'/head is a readable entry */ -#define next_hdrq_is_ready() (! 
empty ) - - if (psm3_gen1_cl_q_empty(state->hdrq_head, gen1_hdr_q, recvq->context->psm_hw_ctxt)) - return PSM2_OK; - - PSM2_LOG_MSG("entering"); - - done = !next_hdrq_is_ready(); - - rcv_ev.gen1_hdr_q = gen1_hdr_q; - - while (!done) { - psm3_gen1_get_receive_event(state->hdrq_head, recvq->context->psm_hw_ctxt, 1, - &rcv_ev); - _HFI_VDBG - ("new packet: rcv_hdr %p, rhf %" PRIx64 "\n", - rcv_ev.p_hdr, rcv_ev.gen1_rhf.raw_rhf); - -#ifdef PSM_DEBUG - if_pf(_check_headers(&rcv_ev, gen1_hdr_q)) - goto skip_packet; -#endif - dest_subcontext = _get_proto_subcontext(rcv_ev.p_hdr); - - /* If the destination is not our subcontext, process - * message as subcontext message (shared contexts) */ - if (dest_subcontext != recvq->subcontext) { - rcv_ev.ipsaddr = NULL; - - ret = recvq->recvq_callbacks.callback_subcontext - (&rcv_ev, dest_subcontext); - if (ret == IPS_RECVHDRQ_REVISIT) - { - // try processing on next progress call - PSM2_LOG_MSG("leaving"); - GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); /* perf stats */ - return PSM2_OK_NO_PROGRESS; - } - - goto skip_packet; - } - - if_pf(psm3_gen1_rhf_get_all_err_flags(rcv_ev.gen1_rhf)) { - - _update_error_stats(recvq->proto, psm3_gen1_rhf_get_all_err_flags(rcv_ev.gen1_rhf)); - - recvq->recvq_callbacks.callback_error(&rcv_ev); - - if ((psm3_gen1_rhf_get_rx_type(rcv_ev.gen1_rhf) != PSM3_GEN1_RHF_RX_TYPE_EAGER) || - (!(psm3_gen1_rhf_get_all_err_flags(rcv_ev.gen1_rhf) & PSM3_GEN1_RHF_ERR_TID))) - goto skip_packet; - - /* no pending eager update, header - * is not currently under tracing. */ - if (state->hdr_countdown == 0 && - state->rcv_egr_index_head == NO_EAGER_UPDATE) { - uint32_t egr_cnt = psm3_gen1_get_rx_egr_tid_cnt(recvq->context->psm_hw_ctxt); - psm3_gen1_cl_idx etail=0, ehead=0; - - ehead = psm3_gen1_get_cl_q_head_index( - psm_hal_egr_q, - rcv_ev.recvq->context->psm_hw_ctxt); - etail = psm3_gen1_get_cl_q_tail_index( - psm_hal_egr_q, - rcv_ev.recvq->context->psm_hw_ctxt); - if (ehead == ((etail + 1) % egr_cnt)) { - /* eager is full, - * trace existing header entries */ - uint32_t hdr_size = - recvq->hdrq_elemlast + - hdrq_elemsz; - psm3_gen1_cl_idx htail=0; - - htail = psm3_gen1_get_cl_q_tail_index( - gen1_hdr_q, - rcv_ev.recvq->context->psm_hw_ctxt); - const uint32_t hhead = state->hdrq_head; - - state->hdr_countdown = - (htail > hhead) ? - (htail - hhead) : - (htail + hdr_size - hhead); - } - } - - /* Eager packet and tiderr. - * Don't consider updating egr head, unless we're in - * the congested state. If we're congested, we should - * try to keep the eager buffers free. 
*/ - - if (!rcv_ev.is_congested) - goto skip_packet_no_egr_update; - else - goto skip_packet; - } - - /* If checksum is enabled, verify that it is valid */ - if_pf(rcv_ev.has_cksum && !do_pkt_cksum(&rcv_ev)) - goto skip_packet; - - if (_HFI_VDBG_ON) - { - psm3_gen1_cl_idx egr_buff_q_head, egr_buff_q_tail; - - egr_buff_q_head = psm3_gen1_get_cl_q_head_index( - psm_hal_egr_q, - rcv_ev.recvq->context->psm_hw_ctxt); - egr_buff_q_tail = psm3_gen1_get_cl_q_tail_index( - psm_hal_egr_q, - rcv_ev.recvq->context->psm_hw_ctxt); - - _HFI_VDBG_ALWAYS( - "hdrq_head %d, p_hdr: %p, opcode %x, payload %p paylen %d; " - "egrhead %x egrtail %x; " - "useegrbit %x egrindex %x, egroffset %x, egrindexhead %x\n", - state->hdrq_head, - rcv_ev.p_hdr, - _get_proto_hfi_opcode(rcv_ev.p_hdr), - ips_recvhdrq_event_payload(&rcv_ev), - ips_recvhdrq_event_paylen(&rcv_ev), - egr_buff_q_head,egr_buff_q_tail, - psm3_gen1_rhf_get_use_egr_buff(rcv_ev.gen1_rhf), - psm3_gen1_rhf_get_egr_buff_index(rcv_ev.gen1_rhf), - psm3_gen1_rhf_get_egr_buff_offset(rcv_ev.gen1_rhf), - state->rcv_egr_index_head); - } - - PSM2_LOG_PKT_STRM(PSM2_LOG_RX,rcv_ev.p_hdr,&rcv_ev.gen1_rhf.raw_rhf, - "PKT_STRM:"); - - /* Classify packet from a known or unknown endpoint */ - epstaddr = ips_epstate_lookup(recvq->epstate, - rcv_ev.p_hdr->connidx); - if_pf((epstaddr == NULL) || (epstaddr->ipsaddr == NULL)) { - rcv_ev.ipsaddr = NULL; - recvq->recvq_callbacks. - callback_packet_unknown(&rcv_ev); - } else { - rcv_ev.ipsaddr = epstaddr->ipsaddr; - psmi_assert(PSMI_HOWMANY(psm3_gen1_packet_service_routines) - == OPCODE_FUTURE_FROM - OPCODE_RESERVED); - ret = ips_proto_process_packet(&rcv_ev, - psm3_gen1_packet_service_routines); - if (ret == IPS_RECVHDRQ_REVISIT) - { - // try processing on next progress call - PSM2_LOG_MSG("leaving"); - GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); /* perf stats */ - return PSM2_OK_NO_PROGRESS; - } - } - -skip_packet: - /* - * if eager buffer is used, record the index. - */ - if (psm3_gen1_rhf_get_use_egr_buff(rcv_ev.gen1_rhf)) { - /* set only when a new entry is used */ - if (psm3_gen1_rhf_get_egr_buff_offset(rcv_ev.gen1_rhf) == 0) { - state->rcv_egr_index_head = - psm3_gen1_rhf_get_egr_buff_index(rcv_ev.gen1_rhf); - state->num_egrq_done++; - } - /* a header entry is using an eager entry, stop tracing. */ - state->hdr_countdown = 0; - } - -skip_packet_no_egr_update: - /* Note that state->hdrq_head is sampled speculatively by the code - * in psm3_gen1_ips_ptl_shared_poll() when context sharing, so it is not safe - * for this shared variable to temporarily exceed the last element. */ - _HFI_VDBG - ("head %d, elemsz %d elemlast %d\n", - state->hdrq_head, hdrq_elemsz, - recvq->hdrq_elemlast); - psm3_gen1_retire_hdr_q_entry(&state->hdrq_head, gen1_hdr_q, - recvq->context->psm_hw_ctxt, - hdrq_elemsz, recvq->hdrq_elemlast, &empty); - state->num_hdrq_done++; - num_hdrq_done++; - done = (!next_hdrq_is_ready() || (ret == IPS_RECVHDRQ_BREAK) - || (num_hdrq_done == num_hdrq_todo)); - - do_hdr_update = (state->head_update_interval ? 
- (state->num_hdrq_done == - state->head_update_interval) : done); - if (do_hdr_update) { - - psm3_gen1_set_cl_q_head_index( - state->hdrq_head, - gen1_hdr_q, - rcv_ev.recvq->context->psm_hw_ctxt); - /* Reset header queue entries processed */ - state->num_hdrq_done = 0; - } - if (state->num_egrq_done >= state->egrq_update_interval) { - /* Lazy update of egrq */ - if (state->rcv_egr_index_head != NO_EAGER_UPDATE) { - psm3_gen1_set_cl_q_head_index( - state->rcv_egr_index_head, - psm_hal_egr_q, - recvq->context->psm_hw_ctxt); - state->rcv_egr_index_head = NO_EAGER_UPDATE; - state->num_egrq_done = 0; - } - } - if (state->hdr_countdown > 0) { - /* a header entry is consumed. */ - state->hdr_countdown -= hdrq_elemsz; - if (state->hdr_countdown == 0) { - /* header entry count reaches zero. */ - psm3_gen1_cl_idx tail=0; - - tail = psm3_gen1_get_cl_q_tail_index( - psm_hal_egr_q, - recvq->context->psm_hw_ctxt); - - psm3_gen1_cl_idx head=0; - - head = psm3_gen1_get_cl_q_head_index( - psm_hal_egr_q, - recvq->context->psm_hw_ctxt); - - uint32_t egr_cnt = psm3_gen1_get_rx_egr_tid_cnt(recvq->context->psm_hw_ctxt); - /* Checks eager-full again. This is a real false-egr-full */ - if (head == ((tail + 1) % egr_cnt)) { - - psm3_gen1_set_cl_q_tail_index( - tail, - psm_hal_egr_q, - recvq->context->psm_hw_ctxt); - - _HFI_DBG - ("eager array full after overflow, flushing " - "(head %llx, tail %llx)\n", - (long long)head, (long long)tail); - recvq->proto->stats.egr_overflow++; - } else - _HFI_ERROR - ("PSM BUG: EgrOverflow: eager queue is not full\n"); - } - } - } - /* while (hdrq_entries_to_read) */ - - /* Process any pending acks before exiting */ - process_pending_acks(recvq); - - PSM2_LOG_MSG("leaving"); - GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); /* perf stats */ - return num_hdrq_done ? PSM2_OK : PSM2_OK_NO_PROGRESS; -} - -/* This function is designed to implement RAPID CCA. It iterates - through the recvq, checking each element for set FECN or BECN bits. - In the case of finding one, the proper response is executed, and the bits - are cleared. 
-*/ -psm2_error_t psm3_gen1_recvhdrq_scan_cca (struct ips_recvhdrq *recvq) -{ -// TBD - rcv_ev is never returned from this, is_congested and congested_pkts counts never used - -/* Looks at hdr and determines if it is the last item in the queue */ - -#define is_last_hdr(idx) \ - psm3_gen1_cl_q_empty(idx, gen1_hdr_q, recvq->context->psm_hw_ctxt) - - struct ips_recvhdrq_state *state = recvq->state; - PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto = recvq->proto, - .recvq = recvq - }; - - uint32_t num_hdrq_done = state->hdrq_cachedlastscan / - psm3_gen1_get_rx_hdr_q_ent_size(recvq->context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT; - const int num_hdrq_todo = psm3_gen1_get_rx_hdr_q_cnt(recvq->context->psm_hw_ctxt); - const uint32_t hdrq_elemsz = psm3_gen1_get_rx_hdr_q_ent_size(recvq->context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT; - - int done; - uint32_t scan_head = state->hdrq_head + state->hdrq_cachedlastscan; - const psm3_gen1_cl_q gen1_hdr_q = recvq->gen1_cl_hdrq; - - /* Skip the first element, since we're going to process it soon anyway */ - if ( state->hdrq_cachedlastscan == 0 ) - { - scan_head += hdrq_elemsz; - num_hdrq_done++; - } - - PSM2_LOG_MSG("entering"); - done = !is_last_hdr(scan_head); - rcv_ev.gen1_hdr_q = gen1_hdr_q; - while (!done) { - psm3_gen1_get_receive_event(scan_head, recvq->context->psm_hw_ctxt, 0, - &rcv_ev); - _HFI_VDBG - ("scanning new packet for CCA: rcv_hdr %p, rhf %" PRIx64 "\n", - rcv_ev.p_hdr, rcv_ev.gen1_rhf.raw_rhf); - - if_pt ( _is_cca_fecn_set(rcv_ev.p_hdr) & IPS_RECV_EVENT_FECN ) { - struct ips_epstate_entry *epstaddr = ips_epstate_lookup(recvq->epstate, - rcv_ev.p_hdr->connidx); - - if (epstaddr != NULL && epstaddr->ipsaddr != NULL) - { - rcv_ev.ipsaddr = epstaddr->ipsaddr; - - /* Send BECN back */ - ips_epaddr_t *ipsaddr = rcv_ev.ipsaddr; - struct ips_message_header *p_hdr = rcv_ev.p_hdr; - ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); - struct ips_flow *flow; - ips_scb_t ctrlscb; - - psmi_assert(flowid < EP_FLOW_LAST); - flow = &ipsaddr->flows[flowid]; - ctrlscb.scb_flags = 0; - ctrlscb.ips_lrh.data[0].u32w0 = - flow->cca_ooo_pkts; - - rcv_ev.proto->epaddr_stats.congestion_pkts++; - /* Clear FECN event */ - rcv_ev.is_congested &= ~IPS_RECV_EVENT_FECN; - - // no payload, pass cksum so non-NULL - psm3_ips_proto_send_ctrl_message(flow, - OPCODE_BECN, - &flow->ipsaddr-> - ctrl_msg_queued, - &ctrlscb, ctrlscb.cksum, 0); - } - } - else if_pt (0 != (_is_cca_becn_set(rcv_ev.p_hdr) << (IPS_RECV_EVENT_BECN - 1))) { - struct ips_epstate_entry *epstaddr = ips_epstate_lookup(recvq->epstate, - rcv_ev.p_hdr->connidx); - - if (epstaddr != NULL && epstaddr->ipsaddr != NULL) - { - rcv_ev.ipsaddr = epstaddr->ipsaddr; - - /* Adjust flow */ - struct ips_proto *proto = rcv_ev.proto; - struct ips_message_header *p_hdr = rcv_ev.p_hdr; - ips_epaddr_t *ipsaddr = rcv_ev.ipsaddr; - struct ips_flow *flow; - ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); - - psmi_assert(flowid < EP_FLOW_LAST); - flow = &ipsaddr->flows[flowid]; - if ((flow->path->opa.pr_ccti + - proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) { - ips_cca_adjust_rate(flow->path, - proto->cace[flow->path->pr_sl].ccti_increase); - /* Clear congestion event */ - rcv_ev.is_congested &= ~IPS_RECV_EVENT_BECN; - } - } - } - - num_hdrq_done++; - scan_head += hdrq_elemsz; - state->hdrq_cachedlastscan += hdrq_elemsz; - - done = (num_hdrq_done == num_hdrq_todo && !is_last_hdr(scan_head) ); - - } - /* while (hdrq_entries_to_read) */ - - - PSM2_LOG_MSG("leaving"); - return PSM2_OK; -} -#endif /* 
PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_sdma.c b/psm3/hal_gen1/gen1_sdma.c deleted file mode 100644 index 1599796..0000000 --- a/psm3/hal_gen1/gen1_sdma.c +++ /dev/null @@ -1,893 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2017 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2017 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. */ - -/* included header files */ -#include -#include -#include -#include -#include - -#include "psm_user.h" -#include "ips_proto_params.h" -#include "psm2_hal.h" -#include "ips_proto.h" -#include "gen1_user.h" -#include "psmi_wrappers.h" -#include "gen1_hal.h" - -// could just replace with HFI_SDMA_HDR_SIZE in callers. 
-// should always be HFI_SDMA_HDR_SIZE -PSMI_ALWAYS_INLINE(int psm3_gen1_get_sdma_req_size(psmi_hal_hw_context ctxt)) -{ - return get_psm_gen1_hi()->hfp_private.sdmahdr_req_size; -} - -PSMI_ALWAYS_INLINE(int psm3_gen1_get_sdma_ring_slot_status(int slotIdx, - psmi_hal_sdma_ring_slot_status *status, - uint32_t *errorCode, - psmi_hal_hw_context ctxt)) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - if (slotIdx < 0 || slotIdx >= ctrl->ctxt_info.sdma_ring_size) - { - *status = PSM_HAL_SDMA_RING_ERROR; - return -PSM_HAL_ERROR_GENERAL_ERROR; - } - - struct hfi1_sdma_comp_entry *sdma_comp_queue = (struct hfi1_sdma_comp_entry *) - ctrl->base_info.sdma_comp_bufbase; - - switch (sdma_comp_queue[slotIdx].status) - { - case FREE: - *status = PSM_HAL_SDMA_RING_AVAILABLE; - break; - case QUEUED: - *status = PSM_HAL_SDMA_RING_QUEUED; - break; - case COMPLETE: - *status = PSM_HAL_SDMA_RING_COMPLETE; - break; - case ERROR: - *status = PSM_HAL_SDMA_RING_ERROR; - break; - default: - *status = PSM_HAL_SDMA_RING_ERROR; - return -PSM_HAL_ERROR_GENERAL_ERROR; - } - *errorCode = sdma_comp_queue[slotIdx].errcode; - return PSM_HAL_ERROR_OK; -} - -/* Returns > 0 if the specified slot is available. 0 if not available - and a negative value if an error occurred. */ -PSMI_ALWAYS_INLINE(int psm3_gen1_dma_slot_available(int slotidx, psmi_hal_hw_context ctxt)) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - - if (slotidx < 0 || slotidx >= ctrl->ctxt_info.sdma_ring_size) - return -1; - - struct hfi1_sdma_comp_entry *sdma_comp_queue = (struct hfi1_sdma_comp_entry *) - ctrl->base_info.sdma_comp_bufbase; - - return sdma_comp_queue[slotidx].status != QUEUED; -} - -/* Initiate a DMA. Intrinsically specifies a DMA slot to use. */ -PSMI_ALWAYS_INLINE(int psm3_gen1_writev(const struct iovec *iov, int iovcnt, struct ips_epinfo *ignored, psmi_hal_hw_context ctxt)) -{ - hfp_gen1_pc_private *psm_hw_ctxt = (hfp_gen1_pc_private *)ctxt; - - return psm3_gen1_nic_cmd_writev(psm_hw_ctxt->ctrl->fd, iov, iovcnt); -} - -/* - * Driver defines the following sdma completion error code, returned - * as negative value: - * #define SDMA_TXREQ_S_OK 0 - * #define SDMA_TXREQ_S_SENDERROR 1 - * #define SDMA_TXREQ_S_ABORTED 2 - * #define SDMA_TXREQ_S_SHUTDOWN 3 - * - * When hfi is in freeze mode, driver will complete all the pending - * sdma request as aborted. Since PSM needs to recover from hfi - * freeze mode, this routine ignore aborted error. - */ -psm2_error_t psm3_gen1_dma_completion_update(struct ips_proto *proto) -{ - ips_scb_t *scb; - - while (proto->sdma_done_index != proto->sdma_fill_index) { - psmi_hal_sdma_ring_slot_status status; - uint32_t errorCode; - int rc = psm3_gen1_get_sdma_ring_slot_status(proto->sdma_done_index, &status, &errorCode, - proto->ep->context.psm_hw_ctxt); - psmi_rmb(); - - if (rc < 0) - return PSM2_INTERNAL_ERR; - - if (status == PSM_HAL_SDMA_RING_QUEUED) - return PSM2_OK; - - /* Mark sdma request is complete */ - scb = proto->sdma_scb_queue[proto->sdma_done_index]; - if (scb) - { - psmi_assert(status == PSM_HAL_SDMA_RING_COMPLETE); - scb->sdma_outstanding--; - proto->sdma_scb_queue[proto->sdma_done_index] = NULL; - } - - if (status == PSM_HAL_SDMA_RING_ERROR && (int)errorCode != -2) { - psm2_error_t err = - psm3_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, - "SDMA completion error: %d (fd=%d, index=%d)", - 0 - ((int32_t)errorCode), - psm3_gen1_get_fd(proto->ep->context. 
- psm_hw_ctxt), - proto->sdma_done_index); - return err; - } - - proto->sdma_avail_counter++; - proto->sdma_done_index++; - if (proto->sdma_done_index == proto->sdma_queue_size) - proto->sdma_done_index = 0; - } - - return PSM2_OK; -} - -#ifdef PSM_FI -/* - * Fault injection in dma sends. Since DMA through writev() is all-or-nothing, - * we don't inject faults on a packet-per-packet basis since the code gets - * quite complex. Instead, each call to flush_dma or transfer_frame is treated - * as an "event" and faults are generated according to the IPS_FAULTINJ_DMASEND - * setting. - * - * The effect is as if the event was successful but dropped on the wire - * somewhere. - */ -PSMI_ALWAYS_INLINE(int dma_do_fault(psm2_ep_t ep)) -{ - - if_pf(PSM3_FAULTINJ_ENABLED()) { - PSM3_FAULTINJ_STATIC_DECL(fi, "dmalost", - "discard SDMA packets before sending", - 1, IPS_FAULTINJ_DMALOST); - return PSM3_FAULTINJ_IS_FAULT(fi, ep, ""); - } - else - return 0; -} -#endif /* #ifdef PSM_FI */ - -/* - -Handles ENOMEM on a DMA completion. - - */ -static inline -psm2_error_t -handle_ENOMEM_on_DMA_completion(struct ips_proto *proto) -{ - psm2_error_t err; - time_t now = time(NULL); - - if (proto->protoexp && proto->protoexp->tidc.tid_cachemap.payload.nidle) { - uint64_t lengthEvicted = - ips_tidcache_evict(&proto->protoexp->tidc, -1); - - if (!proto->writevFailTime) - proto->writevFailTime = now; - - if (lengthEvicted) - return PSM2_OK; /* signals a retry of the writev command. */ - else { -#ifdef PSM_CUDA - if (PSMI_IS_GDR_COPY_ENABLED && psm3_gen1_gdr_cache_evict()) { - return PSM2_OK; - } else -#endif - return PSM2_EP_NO_RESOURCES; /* should signal a return of - no progress, and retry later */ - } - } -#ifdef PSM_CUDA - else if (PSMI_IS_GDR_COPY_ENABLED) { - uint64_t lengthEvicted = psm3_gen1_gdr_cache_evict(); - if (!proto->writevFailTime) - proto->writevFailTime = now; - - if (lengthEvicted) - return PSM2_OK; - else - return PSM2_EP_NO_RESOURCES; - } -#endif - else if (!proto->writevFailTime) - { - proto->writevFailTime = now; - return PSM2_EP_NO_RESOURCES; /* should signal a return of - no progress, and retry later */ - } - else - { - static const double thirtySeconds = 30.0; - - if (difftime(now, proto->writevFailTime) > - thirtySeconds) { - err = psm3_handle_error( - proto->ep, - PSM2_EP_DEVICE_FAILURE, - "SDMA completion error: out of " - "memory (fd=%d, index=%d)", - psm3_gen1_get_fd(proto->ep->context.psm_hw_ctxt), - proto->sdma_done_index); - return err; - } - return PSM2_EP_NO_RESOURCES; /* should signal a return of - no progress, and retry later */ - } -} - -/* - * Flush all packets currently marked as pending - * Caller still expects num_sent to always be correctly set in case of an - * error. - * - * Recoverable errors: - * PSM2_OK: At least one packet was successfully queued up for DMA. - * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets - * or writev returned a recoverable error (no mem for - * descriptors, dma interrupted or no space left in dma - * queue). - * PSM2_OK_NO_PROGRESS: Cable pulled. - * - * Unrecoverable errors: - * PSM2_EP_DEVICE_FAILURE: Error calling hfi_sdma_inflight() or unexpected - * error in calling writev(), or chip failure, rxe/txe - * parity error. - * PSM2_EP_NO_NETWORK: No network, no lid, ... 
- */ -psm2_error_t -psm3_gen1_dma_send_pending_scbs(struct ips_proto *proto, struct ips_flow *flow, - struct ips_scb_pendlist *slist, int *num_sent) -{ - psm2_error_t err = PSM2_OK; - struct psm_hal_sdma_req_info *sdmahdr; - struct ips_scb *scb; - struct iovec *iovec; - uint16_t iovcnt; - - unsigned int vec_idx = 0; - unsigned int scb_idx = 0, scb_sent = 0; - unsigned int num = 0, max_elem; - uint32_t have_cksum; - uint32_t fillidx; - int16_t credits; -#ifdef PSM_BYTE_FLOW_CREDITS - int16_t credit_bytes; -#endif - ssize_t ret; - -#ifdef PSM_FI - /* See comments above for fault injection */ - if_pf(dma_do_fault(proto->ep)) goto fail; -#endif /* #ifdef PSM_FI */ - - /* Check how many SCBs to send based on flow credits */ - credits = flow->credits; -#ifdef PSM_BYTE_FLOW_CREDITS - credit_bytes = flow->credit_bytes; -#endif - psmi_assert(SLIST_FIRST(slist) != NULL); - SLIST_FOREACH(scb, slist, next) { - num++; - credits -= scb->nfrag; -#ifdef PSM_BYTE_FLOW_CREDITS - credit_bytes -= scb->chunk_size; - if (credits <= 0 || credit_bytes <= 0) - break; -#else - if (credits <= 0) - break; -#endif - } - if (proto->sdma_avail_counter < num) { - /* if there is not enough sdma slot, - * update and use what we have. - */ - err = psm3_gen1_dma_completion_update(proto); - if (err) - goto fail; - if (proto->sdma_avail_counter == 0) { - err = PSM2_EP_NO_RESOURCES; - goto fail; - } - if (proto->sdma_avail_counter < num) - num = proto->sdma_avail_counter; - } - - /* header, payload, checksum, tidarray */ - max_elem = 4 * num; - iovec = alloca(sizeof(struct iovec) * max_elem); - - fillidx = proto->sdma_fill_index; - SLIST_FOREACH(scb, slist, next) { - /* Can't exceed posix max writev count */ - if (vec_idx + (int)!!(scb->payload_size > 0) >= UIO_MAXIOV) - break; - - psmi_assert(vec_idx < max_elem); - psmi_assert_always(((scb->payload_size & 0x3) == 0) || - psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE)); - - /* Checksum all eager packets */ - have_cksum = scb->ips_lrh.flags & IPS_SEND_FLAG_PKTCKSUM; - - /* - * Setup PBC. - */ - psm3_gen1_pbc_update( - proto, - flow, - PSMI_FALSE, - &scb->pbc, - HFI_MESSAGE_HDR_SIZE, - scb->payload_size + - (have_cksum ? PSM_CRC_SIZE_IN_BYTES : 0)); - - psmi_assert(psm3_gen1_dma_slot_available(fillidx, proto->ep->context. - psm_hw_ctxt)); - - size_t extra_bytes; - sdmahdr = psm3_get_sdma_req_info(scb, &extra_bytes); - - // for nfrag==1, *remaining and frag_size undefined - sdmahdr->npkts = - scb->nfrag > 1 ? scb->nfrag_remaining : scb->nfrag; - sdmahdr->fragsize = - scb->nfrag > 1 ? scb->frag_size : flow->frag_size; - - sdmahdr->comp_idx = fillidx; - fillidx++; - if (fillidx == proto->sdma_queue_size) - fillidx = 0; - - /* - * Setup io vector. - */ - iovec[vec_idx].iov_base = sdmahdr; - iovec[vec_idx].iov_len = psm3_gen1_get_sdma_req_size(proto->ep->context. - psm_hw_ctxt) + extra_bytes; - vec_idx++; - iovcnt = 1; - _HFI_VDBG("hdr=%p,%d\n", - iovec[vec_idx - 1].iov_base, - (int)iovec[vec_idx - 1].iov_len); - - if (scb->payload_size > 0) { - /* - * OPA1 supports byte-aligned payload. If it is - * single packet per scb, use payload_size, else - * multi-packets per scb, use remaining chunk_size. - * payload_size is the remaining chunk first packet - * length. - */ - iovec[vec_idx].iov_base = ips_scb_buffer(scb); - iovec[vec_idx].iov_len = scb->nfrag > 1 - ? 
scb->chunk_size_remaining - : scb->payload_size; - vec_idx++; - iovcnt++; -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED && IS_TRANSFER_BUF_GPU_MEM(scb)) { - /* without this attr, CUDA memory accesses - * do not synchronize with gpudirect-rdma accesses. - * We set this field only if the currently loaded driver - * supports this field. If not, we have other problems - * where we have a non gpu-direct enabled driver loaded - * and PSM2 is trying to use GPU features. - */ - if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) - sdmahdr->flags = PSM_HAL_BUF_GPU_MEM; - else - sdmahdr->flags = 0; - } else if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) - sdmahdr->flags = 0; - _HFI_VDBG("seqno=%d hdr=%p,%d,flags 0x%x payload=%p,%d\n", - scb->seq_num.psn_num, - iovec[vec_idx - 2].iov_base, - (int)iovec[vec_idx - 2].iov_len, - sdmahdr->flags, - iovec[vec_idx - 1].iov_base, - (int)iovec[vec_idx - 1].iov_len); -#else - _HFI_VDBG("seqno=%d hdr=%p,%d payload=%p,%d\n", - scb->seq_num.psn_num, - iovec[vec_idx - 2].iov_base, - (int)iovec[vec_idx - 2].iov_len, - iovec[vec_idx - 1].iov_base, - (int)iovec[vec_idx - 1].iov_len); -#endif - } - - /* If checksum then update checksum */ - if (have_cksum) { - scb->cksum[1] = scb->cksum[0]; - iovec[vec_idx].iov_base = scb->cksum; - iovec[vec_idx].iov_len = PSM_CRC_SIZE_IN_BYTES; - vec_idx++; - iovcnt++; - - _HFI_VDBG("chsum=%p,%d\n", - iovec[vec_idx - 1].iov_base, - (int)iovec[vec_idx - 1].iov_len); - } - - /* - * If it is TID receive, attached tid info. - */ - if (scb->tidctrl) { - iovec[vec_idx].iov_base = scb->tsess; - iovec[vec_idx].iov_len = scb->tsess_length; - vec_idx++; - iovcnt++; - -#ifdef PSM_CUDA - /* - * The driver knows to check for "flags" field in - * sdma_req_info only if ctrl=2. - */ - if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { - sdmahdr->ctrl = 2 | - (PSM_HAL_EXP << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | - (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } else -#endif - { - - sdmahdr->ctrl = 1 | - (PSM_HAL_EXP << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | - (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } - _HFI_VDBG("tid-info=%p,%d\n", - iovec[vec_idx - 1].iov_base, - (int)iovec[vec_idx - 1].iov_len); - } else { - -#ifdef PSM_CUDA - if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { - sdmahdr->ctrl = 2 | - (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | - (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } else -#endif - { - sdmahdr->ctrl = 1 | - (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | - (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } - } - - /* Can bound the number to send by 'num' */ - if (++scb_idx == num) - break; - } - psmi_assert(vec_idx > 0); -retry: - ret = psm3_gen1_writev(iovec, vec_idx, &proto->epinfo, proto->ep->context.psm_hw_ctxt); - - if (ret > 0) { - proto->writevFailTime = 0; - /* No need for inflight system call, we can infer it's value - * from - * writev's return value */ - scb_sent += ret; - } else { - /* - * ret == 0: Driver did not queue packet. Try later. - * ENOMEM: No kernel memory to queue request, try later? - * ECOMM: Link may have gone down - * EINTR: Got interrupt while in writev - */ - if (errno == ENOMEM) { - err = handle_ENOMEM_on_DMA_completion(proto); - if (err == PSM2_OK) - goto retry; - } else if (ret == 0 || errno == ECOMM || errno == EINTR) { - err = psm3_gen1_context_check_hw_status(proto->ep); - /* - * During a link bounce the err returned from - * psm3_context_check_status is PSM2_EP_NO_NETWORK. 
In this case - * the error code which we need to return to the calling flush - * function(ips_proto_flow_flush_dma) is PSM2_EP_NO_RESOURCES to - * signal the caller to restart the timers to flush the packets. - * Not doing so would leave the packet on the unacked and - * pending q without the sdma descriptors ever being updated. - */ - if (err == PSM2_OK || err == PSM2_EP_NO_NETWORK) - err = PSM2_EP_NO_RESOURCES; - } else { - err = psm3_handle_error( - proto->ep, - PSM2_EP_DEVICE_FAILURE, - "Unexpected error in writev(): %s (errno=%d) " - "(fd=%d,iovec=%p,len=%d)", - strerror(errno), - errno, - psm3_gen1_get_fd(proto->ep->context.psm_hw_ctxt), - iovec, - vec_idx); - goto fail; - } - } - -fail: - *num_sent = scb_sent; - psmi_assert(*num_sent <= num && *num_sent >= 0); - return err; -} - -/* dma_transfer_frame is used only for control messages, and is - * not enabled by default, and not tested by QA; expected send - * dma goes through dma_send_pending_scbs() */ -psm2_error_t -psm3_gen1_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, - ips_scb_t *scb, void *payload, uint32_t paylen, - uint32_t have_cksum, uint32_t cksum) -{ - ssize_t ret; - psm2_error_t err; - struct psm_hal_sdma_req_info *sdmahdr; - uint16_t iovcnt; - struct iovec iovec[2]; - -#ifdef PSM_FI - /* See comments above for fault injection */ - if_pf(dma_do_fault(proto->ep)) - return PSM2_OK; -#endif /* #ifdef PSM_FI */ - /* - * Check if there is a sdma queue slot. - */ - if (proto->sdma_avail_counter == 0) { - err = psm3_gen1_dma_completion_update(proto); - if (err) - return err; - - if (proto->sdma_avail_counter == 0) { - return PSM2_EP_NO_RESOURCES; - } - } - - /* - * If we have checksum, put to the end of payload. We make sure - * there is enough space in payload for us to put 8 bytes checksum. - * for control message, payload is internal PSM buffer, not user buffer. - */ - if (have_cksum) { - uint32_t *ckptr = (uint32_t *) ((char *)payload + paylen); - *ckptr = cksum; - ckptr++; - *ckptr = cksum; - paylen += PSM_CRC_SIZE_IN_BYTES; - } - - /* - * Setup PBC. - */ - psm3_gen1_pbc_update(proto, flow, PSMI_TRUE, - &scb->pbc, HFI_MESSAGE_HDR_SIZE, paylen); - - /* - * Setup SDMA header and io vector. - */ - size_t extra_bytes; - sdmahdr = psm3_get_sdma_req_info(scb, &extra_bytes); - sdmahdr->npkts = 1; - sdmahdr->fragsize = flow->frag_size; - sdmahdr->comp_idx = proto->sdma_fill_index; - psmi_assert(psm3_gen1_dma_slot_available(proto->sdma_fill_index, proto->ep->context.psm_hw_ctxt)); - - iovcnt = 1; - iovec[0].iov_base = sdmahdr; - iovec[0].iov_len = psm3_gen1_get_sdma_req_size(proto->ep->context.psm_hw_ctxt) + extra_bytes; - - if (paylen > 0) { - iovcnt++; - iovec[1].iov_base = payload; - iovec[1].iov_len = paylen; - } - -#ifdef PSM_CUDA - if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { - sdmahdr->ctrl = 2 | - (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | - (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } else -#endif - { - sdmahdr->ctrl = 1 | - (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | - (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); - } - - /* - * Write into driver to do SDMA work. - */ -retry: - ret = psm3_gen1_writev(iovec, iovcnt, &proto->epinfo, proto->ep->context.psm_hw_ctxt); - - if (ret > 0) { - proto->writevFailTime = 0; - psmi_assert_always(ret == 1); - - proto->sdma_avail_counter--; - proto->sdma_fill_index++; - if (proto->sdma_fill_index == proto->sdma_queue_size) - proto->sdma_fill_index = 0; - - /* - * Wait for completion of this control message if - * stack buffer payload is used. 
This should not be - * a performance issue because sdma control message - * is not a performance code path. - */ - if (iovcnt > 1) { - /* Setup scb ready for completion. */ - psmi_assert(proto->sdma_scb_queue - [sdmahdr->comp_idx] == NULL); - proto->sdma_scb_queue[sdmahdr->comp_idx] = scb; - scb->sdma_outstanding++; - - /* Wait for completion */ - proto->stats.sdma_compl_wait_ctrl++; - err = ips_proto_dma_wait_until(proto, scb); - } else - err = PSM2_OK; - } else { - /* - * ret == 0: Driver did not queue packet. Try later. - * ENOMEM: No kernel memory to queue request, try later? * - * ECOMM: Link may have gone down - * EINTR: Got interrupt while in writev - */ - if (errno == ENOMEM) { - err = handle_ENOMEM_on_DMA_completion(proto); - if (err == PSM2_OK) - goto retry; - } else if (ret == 0 || errno == ECOMM || errno == EINTR) { - err = psm3_gen1_context_check_hw_status(proto->ep); - /* - * During a link bounce the err returned from - * psm3_context_check_status is PSM2_EP_NO_NETWORK. In this case - * the error code which we need to return to the calling flush - * function(ips_proto_flow_flush_dma) is PSM2_EP_NO_RESOURCES to - * signal it to restart the timers to flush the packets. - * Not doing so would leave the packet on the unacked and - * pending q without the sdma descriptors ever being updated. - */ - if (err == PSM2_OK || err == PSM2_EP_NO_NETWORK) - err = PSM2_EP_NO_RESOURCES; - } - - else - err = psm3_handle_error(proto->ep, - PSM2_EP_DEVICE_FAILURE, - "Unhandled error in writev(): " - "%s (fd=%d,iovec=%p,len=%d)", - strerror(errno), - psm3_gen1_get_fd(proto->ep->context.psm_hw_ctxt), - &iovec, - 1); - } - - return err; -} - -PSMI_ALWAYS_INLINE(uint64_t psm3_gen1_get_hw_status(psmi_hal_hw_context ctxt)) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - struct hfi1_status *status = - (struct hfi1_status *) ctrl->base_info.status_bufbase; - uint64_t hw_status = 0; - int i; - - // TBD - known issue, when HAL is built as pure inline - // can't declare static variables in an inline function - // (and shouldn't delcare in a header file in general) - static const struct - { - uint32_t hfi1_status_dev_bit, psmi_hal_status_bit; - } status_dev_map[] = - { - { HFI1_STATUS_INITTED, PSM_HAL_HW_STATUS_INITTED }, - { HFI1_STATUS_CHIP_PRESENT, PSM_HAL_HW_STATUS_CHIP_PRESENT }, - { HFI1_STATUS_HWERROR, PSM_HAL_HW_STATUS_HWERROR }, - }; - - for (i=0; i < sizeof(status_dev_map)/sizeof(status_dev_map[0]); i++) - { - if (status->dev &status_dev_map[i].hfi1_status_dev_bit) - hw_status |= status_dev_map[i].psmi_hal_status_bit; - } - - static const struct - { - uint32_t hfi1_status_port_bit, psmi_hal_status_bit; - } status_port_map[] = - { - { HFI1_STATUS_IB_READY, PSM_HAL_HW_STATUS_IB_READY }, - { HFI1_STATUS_IB_CONF, PSM_HAL_HW_STATUS_IB_CONF }, - }; - - for (i=0; i < sizeof(status_port_map)/sizeof(status_port_map[0]); i++) - { - if (status->port &status_port_map[i].hfi1_status_port_bit) - hw_status |= status_port_map[i].psmi_hal_status_bit; - } - - return hw_status; -} - -PSMI_ALWAYS_INLINE(int psm3_gen1_get_hw_status_freezemsg(volatile char** msg, psmi_hal_hw_context ctxt)) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - struct hfi1_status *status = - (struct hfi1_status *) ctrl->base_info.status_bufbase; - - *msg = (volatile char *) status->freezemsg; - - return PSM2_OK; -} - -/* - * This function works whether a context is initialized or not in a psm2_ep. 
- * - * Returns one of - * - * PSM2_OK: Port status is ok (or context not initialized yet but still "ok") - * PSM2_OK_NO_PROGRESS: Cable pulled - * PSM2_EP_NO_NETWORK: No network, no lid, ... - * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. - * The message follows the per-port status - * As of 7322-ready driver, need to check port-specific qword for IB - * as well as older unit-only. For now, we don't have the port interface - * defined, so just check port 0 qword for spi_status - */ -psm2_error_t psm3_gen1_context_check_hw_status(psm2_ep_t ep) -{ - psm2_error_t err = PSM2_OK; - psmi_context_t *context = &ep->context; - char *errmsg = NULL; - uint64_t status = psm3_gen1_get_hw_status(context->psm_hw_ctxt); - - /* Fatal chip-related errors */ - if (!(status & PSM_HAL_HW_STATUS_CHIP_PRESENT) || - !(status & PSM_HAL_HW_STATUS_INITTED) || - (status & PSM_HAL_HW_STATUS_HWERROR)) { - - err = PSM2_EP_DEVICE_FAILURE; - if (err != context->status_lasterr) { /* report once */ - volatile char *errmsg_sp="no err msg"; - - psm3_gen1_get_hw_status_freezemsg(&errmsg_sp, - context->psm_hw_ctxt); - - if (*errmsg_sp) - psm3_handle_error(ep, err, - "Hardware problem: %s", - errmsg_sp); - else { - if (status & PSM_HAL_HW_STATUS_HWERROR) - errmsg = "Hardware error"; - else - errmsg = "Hardware not found"; - - psm3_handle_error(ep, err, "%s", errmsg); - } - } - } - /* Fatal network-related errors with timeout: */ - else if (!(status & PSM_HAL_HW_STATUS_IB_CONF) || - !(status & PSM_HAL_HW_STATUS_IB_READY)) { - err = PSM2_EP_NO_NETWORK; - if (err != context->status_lasterr) { /* report once */ - context->networkLostTime = time(NULL); - } - else - { - time_t now = time(NULL); - static const double seventySeconds = 70.0; - - /* The linkup time duration for a system should allow the time needed - to complete 3 LNI passes which is: - 50 seconds for a passive copper channel - 65 seconds for optical channel. - (we add 5 seconds of margin.) */ - if (difftime(now,context->networkLostTime) > seventySeconds) - { - volatile char *errmsg_sp="no err msg"; - - psm3_gen1_get_hw_status_freezemsg(&errmsg_sp, - context->psm_hw_ctxt); - - psm3_handle_error(ep, err, "%s", - *errmsg_sp ? errmsg_sp : - "Network down"); - } - } - } - - if (err == PSM2_OK && context->status_lasterr != PSM2_OK) - context->status_lasterr = PSM2_OK; /* clear error */ - else if (err != PSM2_OK) - context->status_lasterr = err; /* record error */ - - return err; -} -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_sdma.h b/psm3/hal_gen1/gen1_sdma.h deleted file mode 100644 index c4ade6c..0000000 --- a/psm3/hal_gen1/gen1_sdma.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2017 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2017 Intel Corporation. 
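[Annotation, not part of the patch] The status handling deleted above is a table-driven bit translation (driver status words into HAL status bits) followed by a severity classification; the removed code also rate-limits reporting and waits roughly 70 seconds of link-down time before printing "Network down". Below is a minimal standalone sketch of that pattern under stated assumptions: the DRV_*/HAL_* constants, translate_status() and classify() are hypothetical stand-ins, not identifiers from this tree.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the driver's HFI1_STATUS_* bits. */
#define DRV_INITTED      0x01u
#define DRV_CHIP_PRESENT 0x02u
#define DRV_HWERROR      0x04u
#define DRV_IB_READY     0x08u
#define DRV_IB_CONF      0x10u

/* Hypothetical stand-ins for the HAL's PSM_HAL_HW_STATUS_* bits. */
#define HAL_INITTED      0x01u
#define HAL_CHIP_PRESENT 0x02u
#define HAL_HWERROR      0x04u
#define HAL_IB_READY     0x08u
#define HAL_IB_CONF      0x10u

enum hw_verdict { HW_OK = 0, HW_NO_NETWORK, HW_DEVICE_FAILURE };

/* Table-driven translation, same shape as the status_dev_map/status_port_map
 * loops in the removed psm3_gen1_get_hw_status(). */
static uint64_t translate_status(uint32_t drv_bits)
{
	static const struct { uint32_t drv; uint64_t hal; } map[] = {
		{ DRV_INITTED,      HAL_INITTED },
		{ DRV_CHIP_PRESENT, HAL_CHIP_PRESENT },
		{ DRV_HWERROR,      HAL_HWERROR },
		{ DRV_IB_READY,     HAL_IB_READY },
		{ DRV_IB_CONF,      HAL_IB_CONF },
	};
	uint64_t out = 0;
	for (size_t i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		if (drv_bits & map[i].drv)
			out |= map[i].hal;
	return out;
}

/* Severity split used by the removed check routine: chip problems are fatal
 * immediately, a down link is a (possibly transient) network error. */
static enum hw_verdict classify(uint64_t hal_bits)
{
	if (!(hal_bits & HAL_CHIP_PRESENT) || !(hal_bits & HAL_INITTED) ||
	    (hal_bits & HAL_HWERROR))
		return HW_DEVICE_FAILURE;
	if (!(hal_bits & HAL_IB_CONF) || !(hal_bits & HAL_IB_READY))
		return HW_NO_NETWORK;
	return HW_OK;
}

int main(void)
{
	/* Chip present and initialized, but link not ready: classified as a
	 * network problem rather than a device failure. */
	printf("%d\n", classify(translate_status(DRV_INITTED | DRV_CHIP_PRESENT)));
	return 0;
}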
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. */ - -#ifndef PSM_HAL_GEN1_SDMA_H -#define PSM_HAL_GEN1_SDMA_H - -#include "psm_user.h" -struct ips_proto; - -psm2_error_t psm3_gen1_dma_completion_update(struct ips_proto *proto); - -psm2_error_t -psm3_gen1_dma_send_pending_scbs(struct ips_proto *proto, struct ips_flow *flow, - struct ips_scb_pendlist *slist, int *num_sent); -psm2_error_t -psm3_gen1_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, - ips_scb_t *scb, void *payload, uint32_t paylen, - uint32_t have_cksum, uint32_t cksum); - -psm2_error_t psm3_gen1_context_check_hw_status(psm2_ep_t ep); - -#endif /* PSM_HAL_GEN1_SDMA_H */ -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_service.c b/psm3/hal_gen1/gen1_service.c deleted file mode 100644 index b56d77a..0000000 --- a/psm3/hal_gen1/gen1_service.c +++ /dev/null @@ -1,972 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2018 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2018 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* This file contains hfi service routine interface used by the low - level hfi protocol code. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "utils_sysfs.h" -#include "gen1_service.h" -#include "psmi_wrappers.h" -#include "psm_netutils.h" - -typedef union -{ - struct - { - uint16_t minor; - uint16_t major; - }; - uint32_t version; -} sw_version_t; - -static sw_version_t sw_version = -{ - { - .major = HFI1_USER_SWMAJOR, - .minor = HFI1_USER_SWMINOR - } -}; - -/* fwd declaration */ -ustatic int psm3_gen1_old_nic_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count); - -#ifdef PSM2_SUPPORT_IW_CMD_API -/* fwd declaration */ -ustatic int psm3_gen1_nic_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count); - -/* Function pointer. */ -static int (*psm3_gen1_nic_cmd_send)(int fd, struct hfi1_cmd *cmd, size_t count) = psm3_gen1_nic_cmd_ioctl; -#else -/* Function pointer. */ -static int (*const psm3_gen1_nic_cmd_send)(int fd, struct hfi1_cmd *cmd, size_t count) = psm3_gen1_old_nic_cmd_write; -#endif - -uint16_t psm3_gen1_get_user_major_version(void) -{ - return sw_version.major; -} - -void psm3_gen1_set_user_major_version(uint16_t major_version) -{ - sw_version.major = major_version; -} - -uint16_t psm3_gen1_get_user_minor_version(void) -{ - return sw_version.minor; -} - -void psm3_gen1_set_user_version(uint32_t version) -{ - sw_version.version = version; -} - -int psm3_gen1_nic_context_open_ex(int unit, int port, uint64_t open_timeout, - char *dev_name,size_t dev_name_len) -{ - int fd; - - //psmi_assert_always(unit >= 0); - snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH_GEN1, - unit); - - if ((fd = open(dev_name, O_RDWR)) == -1) { - _HFI_DBG("(host:Can't open %s for reading and writing", - dev_name); - return -1; - } - - if (fcntl(fd, F_SETFD, FD_CLOEXEC)) - _HFI_INFO("Failed to set close on exec for device: %s\n", - strerror(errno)); - -#ifdef PSM2_SUPPORT_IW_CMD_API - { - /* if hfi1DriverMajor == -1, then we are potentially talking to a new driver. 
- Let's confirm by issuing an ioctl version request: */ - struct hfi1_cmd c; - - memset(&c, 0, sizeof(struct hfi1_cmd)); - c.type = PSMI_HFI_CMD_GET_VERS; - c.len = 0; - c.addr = 0; - - if (psm3_gen1_nic_cmd_write(fd, &c, sizeof(c)) == -1) { - /* Let's assume that the driver is the old driver */ - psm3_gen1_set_user_major_version(IOCTL_CMD_API_MODULE_MAJOR - 1); - /* the old driver uses write() for its command interface: */ - psm3_gen1_nic_cmd_send = psm3_gen1_old_nic_cmd_write; - } - else - { - int major = c.addr >> HFI1_SWMAJOR_SHIFT; - if (major != psm3_gen1_get_user_major_version()) { - /* If there is a skew between the major version of the driver - that is executing and the major version which was used during - compilation of PSM, we treat that is a fatal error. */ - _HFI_INFO("PSM3 and driver version mismatch: (%d != %d)\n", - major, psm3_gen1_get_user_major_version()); - close(fd); - return -1; - } - } - } - -#endif - return fd; -} - -/* - * Check if non-double word multiple message size for SDMA is allowed to be - * pass to the driver. Starting from 6.2 driver version, PSM is able to pass - * to the driver message which size is not a multiple of double word for SDMA. - */ -uint32_t psm3_gen1_check_non_dw_mul_sdma(void) -{ - uint16_t major = psm3_gen1_get_user_major_version(); - uint16_t minor = psm3_gen1_get_user_minor_version(); - - if ((major > HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) || - ((major == HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) && - (minor >= HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED))) - return 1; - - return 0; -} - -void psm3_gen1_nic_context_close(int fd) -{ - (void)close(fd); -} - -int psm3_gen1_nic_cmd_writev(int fd, const struct iovec *iov, int iovcnt) -{ - return writev(fd, iov, iovcnt); -} - -int psm3_gen1_nic_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count) -{ - return psm3_gen1_nic_cmd_send(fd, cmd, count); -} - -ustatic -int psm3_gen1_old_nic_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count) -{ - const static unsigned int cmdTypeToWriteNum[PSMI_HFI_CMD_LAST] = { - [PSMI_HFI_CMD_ASSIGN_CTXT] = LEGACY_HFI1_CMD_ASSIGN_CTXT, - [PSMI_HFI_CMD_CTXT_INFO] = LEGACY_HFI1_CMD_CTXT_INFO, - [PSMI_HFI_CMD_USER_INFO] = LEGACY_HFI1_CMD_USER_INFO, - [PSMI_HFI_CMD_TID_UPDATE] = LEGACY_HFI1_CMD_TID_UPDATE, - [PSMI_HFI_CMD_TID_FREE] = LEGACY_HFI1_CMD_TID_FREE, - [PSMI_HFI_CMD_CREDIT_UPD] = LEGACY_HFI1_CMD_CREDIT_UPD, - [PSMI_HFI_CMD_RECV_CTRL] = LEGACY_HFI1_CMD_RECV_CTRL, - [PSMI_HFI_CMD_POLL_TYPE] = LEGACY_HFI1_CMD_POLL_TYPE, - [PSMI_HFI_CMD_ACK_EVENT] = LEGACY_HFI1_CMD_ACK_EVENT, - [PSMI_HFI_CMD_SET_PKEY] = LEGACY_HFI1_CMD_SET_PKEY, - [PSMI_HFI_CMD_CTXT_RESET] = LEGACY_HFI1_CMD_CTXT_RESET, - [PSMI_HFI_CMD_TID_INVAL_READ] = LEGACY_HFI1_CMD_TID_INVAL_READ, - [PSMI_HFI_CMD_GET_VERS] = LEGACY_HFI1_CMD_GET_VERS, - }; - - if (cmd->type < PSMI_HFI_CMD_LAST) { - cmd->type = cmdTypeToWriteNum[cmd->type]; - - return psmi_write(fd, cmd, count); - } else { - errno = EINVAL; - return -1; - } -} - -#ifdef PSM2_SUPPORT_IW_CMD_API -ustatic -int psm3_gen1_nic_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count) -{ - uint64_t addrOrLiteral[2] = { (uint64_t)cmd->addr, (uint64_t)&cmd->addr }; - const static struct - { - unsigned int ioctlCmd; - unsigned int addrOrLiteralIdx; - } cmdTypeToIoctlNum[PSMI_HFI_CMD_LAST] = { - [PSMI_HFI_CMD_ASSIGN_CTXT] = {HFI1_IOCTL_ASSIGN_CTXT , 0}, - [PSMI_HFI_CMD_CTXT_INFO] = {HFI1_IOCTL_CTXT_INFO , 0}, - [PSMI_HFI_CMD_USER_INFO] = {HFI1_IOCTL_USER_INFO , 0}, - [PSMI_HFI_CMD_TID_UPDATE] = {HFI1_IOCTL_TID_UPDATE , 0}, - 
[PSMI_HFI_CMD_TID_FREE] = {HFI1_IOCTL_TID_FREE , 0}, - [PSMI_HFI_CMD_CREDIT_UPD] = {HFI1_IOCTL_CREDIT_UPD , 1}, - [PSMI_HFI_CMD_RECV_CTRL] = {HFI1_IOCTL_RECV_CTRL , 1}, - [PSMI_HFI_CMD_POLL_TYPE] = {HFI1_IOCTL_POLL_TYPE , 1}, - [PSMI_HFI_CMD_ACK_EVENT] = {HFI1_IOCTL_ACK_EVENT , 1}, - [PSMI_HFI_CMD_SET_PKEY] = {HFI1_IOCTL_SET_PKEY , 1}, - [PSMI_HFI_CMD_CTXT_RESET] = {HFI1_IOCTL_CTXT_RESET , 1}, - [PSMI_HFI_CMD_TID_INVAL_READ] = {HFI1_IOCTL_TID_INVAL_READ, 0}, - [PSMI_HFI_CMD_GET_VERS] = {HFI1_IOCTL_GET_VERS , 1}, -#ifdef PSM_CUDA - [PSMI_HFI_CMD_TID_UPDATE_V2] = {HFI1_IOCTL_TID_UPDATE_V2 , 0}, -#endif - }; - - if (cmd->type < PSMI_HFI_CMD_LAST) - return psmi_ioctl(fd, - cmdTypeToIoctlNum[cmd->type].ioctlCmd, - addrOrLiteral[cmdTypeToIoctlNum[cmd->type].addrOrLiteralIdx]); - else - { - errno = EINVAL; - return -1; - } -} -#endif /* #ifdef PSM2_SUPPORT_IW_CMD_API */ - -/* we use mmap64() because we compile in both 32 and 64 bit mode, - and we have to map physical addresses that are > 32 bits long. - While linux implements mmap64, it doesn't have a man page, - and isn't declared in any header file, so we declare it here ourselves. - - We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and - redirects mmap to mmap64 for us, but at least through suse10 and fc4, - it doesn't work when the address being mapped is > 32 bits. It chips - off bits 32 and above. So we stay with mmap64. */ -void *psm3_gen1_mmap64(void *addr, size_t length, int prot, int flags, int fd, - __off64_t offset) -{ - return mmap64(addr, length, prot, flags, fd, offset); -} - -/* get the number of units supported by the driver. Does not guarantee */ -/* that a working chip has been found for each possible unit #. */ -/* number of units >=0 (0 means none found). */ -/* formerly used sysfs file "num_units" */ -int psm3_hfp_gen1_get_num_units(void) -{ - int ret = 0; - - while (1) { - char pathname[PATH_MAX]; - struct stat st; - int r; - - snprintf(pathname, sizeof(pathname), HFI_DEVICE_PATH_GEN1 "_%d", ret); - r = stat(pathname, &st); - if (r) break; - - ret++; - } - return ret; -} - -/* Given a unit number, returns 1 if any port on the unit is active. - * ports are also filtered based on PSM3_ADDR_FMT and PSM3_SUBNETS and - * ports without appropriate addresses are treated as not active - * returns <= 0 if no port on the unit is active. - */ -int psm3_gen1_get_unit_active(int unit, enum gen1_init_max_speed init_max_speed) -{ - int p, lid; - - for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++) { - lid = psm3_gen1_get_port_lid(unit, p, 0 /*addr_index*/, init_max_speed); - if (lid > 0) - break; - } - - if (p <= HFI_MAX_PORT) - { - return 1; - } - - return lid; -} - -/* deterine if there are any active units. - * returns 1 if at least 1 unfiltered, valid, active unit was found - * returns 0 if none found - * This routine is used during HAL selection prior to HAL initializion. - * This routine and the functions it calls may call utils_sysfs.c functions - * but cannot call any HAL routines (psmi_hal_*). - * psm3_sysfs_init will have been called prior to this to establish the sysfs - * path for devices in the HAL being checked - */ -int psm3_hfp_gen1_have_active_unit(int num_units) -{ - int i; - int ret = 0; - int find_max = ! psm3_nic_speed_wildcard - || (0 == strcmp(psm3_nic_speed_wildcard, "max")); - - psm3_nic_speed_max_found = 0; // reset from any previous HAL - for (i=0; i 0) { - _HFI_DBG("Found unfiltered active unit %d\n", i); - if (! 
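[Annotation, not part of the patch] The open path removed above probes the driver once with a GET_VERS command and, if that fails, flips a function pointer from the ioctl-based command path to the legacy write()-based one, renumbering commands through a lookup table. A compressed sketch of that dispatch pattern follows; the stub transports and command codes are hypothetical, and only the legacy numbers 2 and 14 (CTXT_INFO / GET_VERS, from the LEGACY_HFI1_CMD enum later in this patch) are carried over.

#include <errno.h>
#include <stdio.h>

/* Hypothetical abstract command codes (stand-ins for PSMI_HFI_CMD_*). */
enum cmd_code { CMD_CTXT_INFO, CMD_GET_VERS, CMD_LAST };

struct cmd { enum cmd_code code; unsigned long arg; };

/* Modern path: one ioctl per command (stubbed; the removed code called
 * psmi_ioctl() with a per-command ioctl number). */
static int send_ioctl(int fd, struct cmd *c) { (void)fd; (void)c; return 0; }

/* Legacy path: renumber the command, then write() it to the char device. */
static int send_legacy_write(int fd, struct cmd *c)
{
	static const int legacy_num[CMD_LAST] = {
		[CMD_CTXT_INFO] = 2,	/* LEGACY_HFI1_CMD_CTXT_INFO */
		[CMD_GET_VERS]  = 14,	/* LEGACY_HFI1_CMD_GET_VERS  */
	};
	if (c->code >= CMD_LAST) { errno = EINVAL; return -1; }
	(void)fd;
	return legacy_num[c->code] ? 0 : -1;	/* stand-in for psmi_write() */
}

/* All callers go through one function pointer; default to the ioctl path. */
static int (*cmd_send)(int fd, struct cmd *c) = send_ioctl;

static void probe_driver(int fd)
{
	struct cmd probe = { CMD_GET_VERS, 0 };
	/* If the version probe fails, assume an old driver and fall back to
	 * the write()-based interface, as the removed open routine did. */
	if (cmd_send(fd, &probe) == -1)
		cmd_send = send_legacy_write;
}

int main(void)
{
	probe_driver(-1);
	printf("using %s path\n", cmd_send == send_ioctl ? "ioctl" : "legacy write");
	return 0;
}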
find_max) - return 1; - ret = 1; - } else - _HFI_DBG("Skipping unit %d: Filtered or not active\n", i); - } - return ret; -} - -/* get the number of contexts from the unit id. */ -/* Returns 0 if no unit or no match. */ -int psm3_hfp_gen1_get_num_contexts(int unit_id) -{ -#if 0 - int n = 0; - int units, lid; - int64_t val; - uint32_t p = HFI_MIN_PORT; - - units = psm3_hfp_gen1_get_num_units(); - - if_pf(units <= 0) - return 0; - -#if 0 - // never called with NIC_ANY. This would tabulate total contexts - // for all units in the system - if (unit_id == PSM3_NIC_ANY) { - uint32_t u; - - for (u = 0; u < units; u++) { - for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++) { - lid = psm3_gen1_get_port_lid(u, p, 0 /*addr_index*/, GEN1_FILTER); - if (lid > 0) - break; - } - - if (p <= HFI_MAX_PORT && - !psm3_sysfs_unit_read_s64(u, "nctxts", &val, 0)) - n += (uint32_t) val; - } - } else { -#else - { - //psmi_assert_always(unit_id >= 0); -#endif - for (; p <= HFI_MAX_PORT; p++) { - lid = psm3_gen1_get_port_lid(unit_id, p, 0 /*addr_index*/, GEN1_FILTER); - if (lid > 0) - break; - } - - if (p <= HFI_MAX_PORT && - !psm3_sysfs_unit_read_s64(unit_id, "nctxts", &val, 0)) - n += (uint32_t) val; - } - - return n; -#endif - int64_t nctxts=0; - - if (!psm3_sysfs_unit_read_s64(unit_id, "nctxts", &nctxts, 0)) - { - return (int)nctxts; - } - return 0; -} - -/* Given a unit number and port number, returns 1 if the unit and port are active. - returns 0 if the unit and port are not active. - returns -1 when an error occurred. */ -int psm3_hfp_gen1_get_port_active(int unit, int port) -{ - int ret; - char *state; - ret = psm3_sysfs_port_read(unit, port, "phys_state", &state); - if (ret == -1) { - if (errno == ENODEV) - /* this is "normal" for port != 1, on single port chips */ - _HFI_VDBG - ("Failed to get phys_state for unit %u:%u: %s\n", - unit, port, strerror(errno)); - else - _HFI_DBG - ("Failed to get phys_state for unit %u:%u: %s\n", - unit, port, strerror(errno)); - return -1; - } else { - if (strncmp(state, "5: LinkUp", 9)) { - _HFI_DBG("Link is not Up for unit %u:%u\n", unit, port); - psm3_sysfs_free(state); - return 0; - } - psm3_sysfs_free(state); - return 1; - } -} - -/* Given the unit number, port and addr_index - * return an error, or the corresponding LID - * Used so the MPI code can determine it's own - * LID, and which other LIDs (if any) are also assigned to this node - * Returns an int, so <0 indicates an error. 0 may indicate that - * the unit is valid, but no LID has been assigned. - * - * This routine is used in many places, such as get_unit_active, to - * confirm the port is usable. As such it includes additional checks that - * the port is active and has an appropriate address based on PSM3_ADDR_FMT - * and PSM3_SUBNETS. Ports without appropriate addresses are treated as not - * initialized and return -1. 
- * - * For IB/OPA - actual LID is returned, values of 0 indicate - * port is not yet ready for use - * A LID of 0xffff causes a return of 0 as this is an uninitialized IB LID - * For Ethernet (IPv4 or IPv6, RoCE or UDP) 1 is always reported (or <0 for err) - * - * No error print because we call this for both potential - * ports without knowing if both ports exist (or are connected) - */ -int psm3_gen1_get_port_lid(int unit, int port, int addr_index, enum gen1_init_max_speed init_max_speed) -{ - int ret = 0; - int64_t val = 0; - uint64_t speed; - - if (port < HFI_MIN_PORT || port > HFI_MAX_PORT) - return -1; - if (addr_index < 0 || addr_index > psm3_addr_per_nic) - return -1; - - if (psm3_hfp_gen1_get_port_active(unit,port) != 1) - return -2; - // make sure the port matches the wildcard - if (1 != psm3_is_nic_allowed(unit)) - return -1; - - ret = psm3_sysfs_port_read_s64(unit, port, "lid", &val, 0); - _HFI_VDBG("ret %d, unit %d port %d lid %ld\n", ret, unit, - port, (long int)val); - if (ret < 0) { - if (errno == ENODEV) - /* this is "normal" for port != 1, on single port chips */ - _HFI_VDBG("Failed to get LID for unit %u:%u: %s\n", - unit, port, strerror(errno)); - else - _HFI_DBG("Failed to get LID for unit %u:%u: %s\n", - unit, port, strerror(errno)); - return -1; - } - // For OPA, PSM3_ADDR_PER_NIC is essentially ignored and addr_index>0 - // reports no LID available. In future could use addr_index to select - // among the LMC LIDs and check LMC has > PSM3_ADDR_PER_NIC here and in - // get_port_subnet filtering of ports - if (addr_index > 0) { - _HFI_DBG("Only addr_index 0 supported for OPA for unit %u:%u\n", - unit, port); - return 0; - } - // be paranoid, for an active port we should have a valid - // LID 1-0xfffe (technically 1-0xbffff due to multicast) - if (val == 0xffff) // uninitialized IB LID - val = 0; // simplify job for callers - if (! val) { - _HFI_DBG("Uninitialized LID for unit %u:%u\n", - unit, port); - // no need to check other filters, can't use this unit - return 0; - } - ret = val; // LID we got - - if (init_max_speed != GEN1_NOFILTER) { - if (0 != psm3_hfp_gen1_get_port_speed(unit, port, &speed)) { - _HFI_DBG("Failed to get port speed for unit %u:%u: %s\n", - unit, port, strerror(errno)); - return -1; - } - if (init_max_speed == GEN1_FINDMAX) { - if (speed > psm3_nic_speed_max_found) { - psm3_nic_speed_max_found = speed; - _HFI_DBG("Updated max NIC speed unit %u:%u: %"PRIu64"\n", - unit, port, speed); - } - } else if (1 != psm3_is_speed_allowed(unit, speed)) { - return -1; - } - } - -/* disable this feature since we don't have a way to provide - file descriptor in multiple context case. */ -#if 0 - if (getenv("PSM3_DIAG_LID_LOOP")) { - /* provides diagnostic ability to run MPI, etc. 
even */ - /* on loopback, by claiming a different LID for each context */ - struct hfi1_ctxt_info info; - struct hfi1_cmd cmd; - cmd.type = PSMI_HFI_CMD_CTXT_INFO; - cmd.cmd.ctxt_info = (uintptr_t) &info; - if (__hfi_lastfd == -1) - _HFI_INFO - ("Can't run CONTEXT_INFO for lid_loop, fd not set\n"); - else if (write(__hfi_lastfd, &cmd, sizeof(cmd)) == -1) - _HFI_INFO("CONTEXT_INFO command failed: %s\n", - strerror(errno)); - else if (!info.context) - _HFI_INFO("CONTEXT_INFO returned context 0!\n"); - else { - _HFI_PRDBG - ("Using lid 0x%x, base %x, context %x\n", - ret + info.context, ret, info.context); - ret += info.context; - } - } -#endif // 0 - - return ret; -} - -/* Given the unit number, return an error, or the corresponding GID - * When filter is set, we will ignore GIDs which aren't a "RoCE v2" type - * (other possible types are "IB/RoCE v1" or "Invalid GID type") - * Returns 0 on success, -1 on error. - * No error print because we call this for both potential - * ports without knowing if both ports exist (or are connected) - */ -static int psm3_gen1_get_port_gid(int unit, int port, int idx, int filter, - psmi_gid128_t *gidp) -{ - int ret; - char *gid_str = NULL; - char attr_str[64]; - - snprintf(attr_str, sizeof(attr_str), "gids/%d", idx < 0 ? 0 : idx); - ret = psm3_sysfs_port_read(unit, port, attr_str, &gid_str); - if (ret == -1) { - if (errno == ENODEV) - /* this is "normal" for port != 1, on single - * port chips */ - _HFI_VDBG("Failed to get GID %d for unit %u:%u: %s\n", - idx, unit, port, strerror(errno)); - else - _HFI_DBG("Failed to get GID %d for unit %u:%u: %s\n", - idx, unit, port, strerror(errno)); - } else { - uint32_t gid[8] = {0}; - if (sscanf(gid_str, "%4x:%4x:%4x:%4x:%4x:%4x:%4x:%4x", - &gid[0], &gid[1], &gid[2], &gid[3], - &gid[4], &gid[5], &gid[6], &gid[7]) != 8) { - _HFI_DBG("Failed to parse GID %d for unit %u:%u: %s\n", - idx, unit, port, gid_str); - errno = EINVAL; - ret = -1; - } else { - gidp->hi = (((uint64_t) gid[0]) << 48) - | (((uint64_t) gid[1]) << 32) - | (((uint64_t) gid[2]) << 16) - | (((uint64_t) gid[3]) << 0); - gidp->lo = (((uint64_t) gid[4]) << 48) - | (((uint64_t) gid[5]) << 32) - | (((uint64_t) gid[6]) << 16) - | (((uint64_t) gid[7]) << 0); - ret = 0; - } - psm3_sysfs_free(gid_str); - } - if (0 == ret && filter && (gidp->lo || gidp->hi)) { - snprintf(attr_str, sizeof(attr_str), "gid_attrs/types/%d", idx < 0 ? 
0 : idx); - ret = psm3_sysfs_port_read(unit, port, attr_str, &gid_str); - if (ret == -1) { - _HFI_DBG("Failed to get GID type for unit %u:%u idx %d: %s\n", - unit, port, idx, strerror(errno)); - } else { - /* gid_str includes newline, ignore it */ - if (strncmp(gid_str, "RoCE v2", strlen("RoCE v2"))) { - /* treat filtered entries as empty */ - _HFI_DBG("Filtered out GID unit %d port %d idx %d %s %s", - unit, port, idx, - psm3_gid128_fmt(*gidp, 0), gid_str); - gidp->hi = gidp->lo = 0; - } - psm3_sysfs_free(gid_str); - ret = 0; - } - } - - return ret; -} - -/* Given the unit number, port and addr_index, - * return an error, or the corresponding subnet - * address and GID selected for the unit/port/addr_index - * For IB/OPA the subnet.hi is the hi 64b of the GID, subnet.lo is 0 - * addr is the 128b GID - * prefix_len is always 64 - * For Ethernet IPv4: the subnet is derived from the IPv4 address and netmask - * subnet.hi is 0 - * subnet.lo is the IPv4 address & netmask - * addr.lo is the full 32 bit IPv4 address, addr.hi is 0 - * prefix_len also returned (1-32) - * For Ethernet IPv6: the subnet is the 128b subnet of the 1st non-IPv4 GID - * addr is the full 128b IPv6 address - * prefix_len also returned (1-128) - * idx and gid are always the full GID (RoCEv2 IPv4 style when IPv4 address) - * All output values are in host byte order - * Note this layout means (subnet | addr) == addr for all formats - * - * PSM3_FMT_ADDR (psm3_addr_fmt) sets preferred address type. - * 0 (default) - consider all ports - * For Ethernet return first IPv4 addr found, if no IPv4 return 1st IPv6 - * For OPA/IBA return 1st GID found - * FMT_IPATH, FMT_OPA - Native, only called for OPA ports, return 1st GID found - * FMT_IB - only consider IB/OPA ports - * FMT_IPV4 - only consider Ethernet ports with IPv4 addresses (return first) - * FMT_IPV6 - only consider Ethernet ports with IPv6 addresses (return first) - * When FMT_IB, FMT_IPV4 or FMT_IPV6 specified, non-matching ports return -1. - * - * Returns 0 on success, -1 on error. - * - * No error print because we call this for all potential - * ports of a unit without knowing if each port exists (or is connected) - * For Ethernet a unit will only have a single port (port 1), for IB a unit - * may have more than 1 port. -*/ -int psm3_hfp_gen1_get_port_subnet(int unit, int port, int addr_index, - psmi_subnet128_t *subnet, psmi_naddr128_t *addr, - int *idx, psmi_gid128_t *gid) -{ - int i; - int have_subnet = 0; - - if (addr_index < 0 || addr_index > psm3_addr_per_nic) { - errno = EINVAL; - return -1; - } - // for OPA we only allow addr_index==0 even if PSM3_ADDR_PER_NIC>1 - // In future might use addr_index to select among the LMC LIDs - if (addr_index > 0) { - _HFI_DBG("Skipped OPA unit %d port %d addr_index %d\n", unit, port, addr_index); - return -1; - } - for (i =0; ; i++) { - psmi_gid128_t tmp_gid; - if (-1 == psm3_gen1_get_port_gid(unit, port, i, 0, &tmp_gid)) - break; // stop at 1st non-existent gid (or non-existent port) - // Skip over empty gid table entries. 
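[Annotation, not part of the patch] For reference, the sysfs GID strings read above (gids/<idx>, eight groups of four hex digits separated by colons) reduce to a pair of host-order 64-bit halves. A self-contained sketch of just that parse, with a hypothetical gid128 type standing in for psmi_gid128_t:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct gid128 { uint64_t hi, lo; };

/* Parse the colon-separated hex form exported in sysfs into host-order
 * hi/lo halves, the same sscanf/shift arrangement used by the removed
 * psm3_gen1_get_port_gid(). Returns 0 on success, -1 on a malformed string. */
static int parse_gid(const char *s, struct gid128 *out)
{
	unsigned int g[8];
	if (sscanf(s, "%4x:%4x:%4x:%4x:%4x:%4x:%4x:%4x",
		   &g[0], &g[1], &g[2], &g[3], &g[4], &g[5], &g[6], &g[7]) != 8)
		return -1;
	out->hi = ((uint64_t)g[0] << 48) | ((uint64_t)g[1] << 32) |
		  ((uint64_t)g[2] << 16) |  (uint64_t)g[3];
	out->lo = ((uint64_t)g[4] << 48) | ((uint64_t)g[5] << 32) |
		  ((uint64_t)g[6] << 16) |  (uint64_t)g[7];
	return 0;
}

int main(void)
{
	struct gid128 gid;
	if (parse_gid("fe80:0000:0000:0000:0002:c903:00ab:cdef", &gid) == 0)
		printf("hi=0x%016" PRIx64 " lo=0x%016" PRIx64 "\n", gid.hi, gid.lo);
	return 0;
}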
- // for IB/OPA, the same SubnetPrefix is used for all entries - // so just examine low 64 bits (InterfaceId) - if (tmp_gid.lo == 0) - continue; - // save 1st valid gid, this is answer - if (idx) *idx = i; - if (subnet) *subnet = psm3_build_ib_subnet128(tmp_gid.hi); - if (addr) *addr = psm3_build_ib_naddr128(tmp_gid); - if (gid) *gid = tmp_gid; - have_subnet = 1; - break; // stop at 1st valid gid - } - if (have_subnet) - return 0; - errno = ENXIO; - return -1; -} - -/* in units of bits/sec */ -int psm3_hfp_gen1_get_port_speed(int unit, int port, uint64_t *speed) -{ - char *speedstr = NULL; - int ret = psm3_sysfs_port_read(unit, port, "rate", &speedstr); - if (ret == -1) { - _HFI_DBG("Failed to port speed for unit %u/%u: %s\n", - unit, port, strerror(errno)); - return ret; - } - uint32_t gbps; - int n = sscanf(speedstr, "%u Gb/sec", &gbps); - if (n != 1) { - _HFI_DBG("Failed to parse port speed(%s) for unit %u/%u: sccanf ret = %d\n", - speedstr, unit, port, n); - ret = -1; - goto free; - } - if (speed) *speed = (uint64_t)gbps * 1000 * 1000 * 1000; - _HFI_VDBG("Got speed for for unit/port %d/%d: %u Gb/s\n", - unit, port, gbps); -free: - psm3_sysfs_free(speedstr); - return ret < 0 ? -1 : 0; -} - -/* Given the unit number, return an error, or the corresponding LMC value - for the port */ -/* Returns an int, so -1 indicates an error. 0 */ -int psm3_gen1_get_port_lmc(int unit, int port) -{ - int ret; - int64_t val; - - ret = psm3_sysfs_port_read_s64(unit, port, "lid_mask_count", &val, 0); - - if (ret == -1) { - _HFI_INFO("Failed to get LMC for unit %u:%u: %s\n", - unit, port, strerror(errno)); - } else - ret = val; - - return ret; -} - -/* Given a unit, port and SL, return an error, or the corresponding SC for the - SL as programmed by the SM */ -/* Returns an int, so -1 indicates an error. */ -int psm3_gen1_get_port_sl2sc(int unit, int port, int sl) -{ - int ret; - int64_t val; - char sl2scpath[16]; - - snprintf(sl2scpath, sizeof(sl2scpath), "sl2sc/%d", sl); - ret = psm3_sysfs_port_read_s64(unit, port, sl2scpath, &val, 0); - - if (ret == -1) { - _HFI_DBG - ("Failed to get SL2SC mapping for SL %d unit %u:%u: %s\n", - sl, unit, port, strerror(errno)); - } else - ret = val; - - return ret; -} - -/* Given a unit, port and SC, return an error, or the corresponding VL for the - SC as programmed by the SM */ -/* Returns an int, so -1 indicates an error. */ -int psm3_gen1_get_port_sc2vl(int unit, int port, int sc) -{ - int ret; - int64_t val; - char sc2vlpath[16]; - - snprintf(sc2vlpath, sizeof(sc2vlpath), "sc2vl/%d", sc); - ret = psm3_sysfs_port_read_s64(unit, port, sc2vlpath, &val, 0); - - if (ret == -1) { - _HFI_DBG - ("Failed to get SC2VL mapping for SC %d unit %u:%u: %s\n", - sc, unit, port, strerror(errno)); - } else - ret = val; - - return ret; -} - -/* Given a unit, port and VL, return an error, or the corresponding MTU for the - VL as programmed by the SM */ -/* Returns an int, so -1 indicates an error. */ -int psm3_gen1_get_port_vl2mtu(int unit, int port, int vl) -{ - int ret; - int64_t val; - char vl2mtupath[16]; - - snprintf(vl2mtupath, sizeof(vl2mtupath), "vl2mtu/%d", vl); - ret = psm3_sysfs_port_read_s64(unit, port, vl2mtupath, &val, 0); - - if (ret == -1) { - _HFI_DBG - ("Failed to get VL2MTU mapping for VL %d unit %u:%u: %s\n", - vl, unit, port, strerror(errno)); - } else - ret = val; - - return ret; -} - -/* Given a unit, port and index, return an error, or the corresponding pkey - value for the index as programmed by the SM */ -/* Returns an int, so -1 indicates an error. 
*/ -int psm3_gen1_get_port_index2pkey(int unit, int port, int index) -{ - int ret; - int64_t val; - char index2pkeypath[16]; - - snprintf(index2pkeypath, sizeof(index2pkeypath), "pkeys/%d", index); - ret = psm3_sysfs_port_read_s64(unit, port, index2pkeypath, &val, 0); - - if (ret == -1) { - _HFI_DBG - ("Failed to get index2pkey mapping for index %d unit %u:%u: %s\n", - index, unit, port, strerror(errno)); - } else - ret = val; - - return ret; -} - -int psm3_gen1_get_cc_settings_bin(int unit, int port, char *ccabuf, size_t len_ccabuf) -{ - int fd; - - /* - * 4 bytes for 'control map' - * 2 bytes 'port control' - * 32 (#SLs) * 6 bytes 'congestion setting' (per-SL) - */ - const size_t count = 4 + 2 + (32 * 6); - const char *unitpath = psm3_sysfs_unit_path(unit); - - if (count > len_ccabuf) - return -2; -/* - * Check qib driver CCA setting, and try to use it if available. - * Fall to self CCA setting if errors. - */ - if (unitpath == NULL - || snprintf(ccabuf, len_ccabuf, "%s/ports/%d/CCMgtA/cc_settings_bin", - unitpath, port) >= (len_ccabuf-1)) - return -1; - - fd = open(ccabuf, O_RDONLY); - if (fd < 0) { - return 0; - } - - if (read(fd, ccabuf, count) != count) { - _HFI_CCADBG("Read cc_settings_bin failed. using static CCA\n"); - close(fd); - return 0; - } - - close(fd); - - return 1; -} - -int psm3_gen1_get_cc_table_bin(int unit, int port, uint16_t **cctp) -{ - int i; - unsigned short ccti_limit; - uint16_t *cct; - int fd; - char pathname[256]; - *cctp = NULL; - const char *unitpath = psm3_sysfs_unit_path(unit); - - if (unitpath == NULL - || snprintf(pathname,sizeof(pathname), "%s/ports/%d/CCMgtA/cc_table_bin", - unitpath, port) >= (sizeof(pathname)-1)) - return -1; - - fd = open(pathname, O_RDONLY); - if (fd < 0) { - _HFI_CCADBG("Open cc_table_bin failed. using static CCA\n"); - return 0; - } - if (read(fd, &ccti_limit, sizeof(ccti_limit)) != sizeof(ccti_limit)) { - _HFI_CCADBG("Read ccti_limit failed. using static CCA\n"); - close(fd); - return 0; - } - - _HFI_CCADBG("ccti_limit = %d\n", ccti_limit); - - if (ccti_limit < 63) { - _HFI_CCADBG("Read ccti_limit %d not in range [63, 65535], " - "using static CCA.\n", ccti_limit); - close(fd); - return 0; - } - - i = (ccti_limit + 1) * sizeof(uint16_t); - cct = malloc(i); - if (!cct) { - close(fd); - return -1; - } - if (read(fd, cct, i) != i) { - _HFI_CCADBG("Read ccti_entry_list, using static CCA\n"); - free(cct); - close(fd); - return 0; - } - - close(fd); - - _HFI_CCADBG("cct[0] = 0x%04x\n", cct[0]); - - *cctp = cct; - return ccti_limit; -} - -/* - * This is for diag function psm3_gen1_wait_for_packet() only - */ -int psm3_gen1_cmd_wait_for_packet(int fd) -{ - int ret; - struct pollfd pfd; - - pfd.fd = fd; - pfd.events = POLLIN; - - ret = poll(&pfd, 1, 500 /* ms */); - - return ret; -} -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_service.h b/psm3/hal_gen1/gen1_service.h deleted file mode 100644 index c5a1f12..0000000 --- a/psm3/hal_gen1/gen1_service.h +++ /dev/null @@ -1,256 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. 
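[Annotation, not part of the patch] The congestion-control table read deleted above has a simple binary layout: a 16-bit ccti_limit followed by ccti_limit + 1 16-bit entries, with a short read or a limit below 63 causing a fall back to static CCA. A rough stdio-based equivalent is sketched below; read_cc_table() is a hypothetical helper (the original read from an open fd with read(2)), and the caller is expected to free the returned table.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Returns the ccti_limit (> 0) on success, 0 to fall back to static CCA,
 * or -1 on allocation failure. */
static int read_cc_table(FILE *f, uint16_t **table_out)
{
	uint16_t limit;
	*table_out = NULL;
	if (fread(&limit, sizeof(limit), 1, f) != 1)
		return 0;		/* unreadable: fall back to static CCA */
	if (limit < 63)
		return 0;		/* out of range: fall back */
	size_t n = (size_t)limit + 1;
	uint16_t *t = malloc(n * sizeof(*t));
	if (!t)
		return -1;
	if (fread(t, sizeof(*t), n, f) != n) {
		free(t);
		return 0;		/* truncated table: fall back */
	}
	*table_out = t;
	return limit;
}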
- - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef PSM_HAL_GEN1_SERVICE_H -#define PSM_HAL_GEN1_SERVICE_H - -/* This file contains all the lowest level routines calling into sysfs */ -/* and qib driver. All other calls are based on these routines. */ - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE /* See feature_test_macros(7) */ -#endif -#include /* cpu_set_t and CPU_* MACROs */ -#include - -#include "utils_user.h" -#include "gen1_types.h" -#include "gen1_common.h" -#include "psm_netutils.h" - -/* HAL specific upper and lower bounds for NIC port numbers */ -#define HFI_MIN_PORT 1 -#define HFI_MAX_PORT 1 -#ifndef HFI_NUM_PORTS_GEN1 -#define HFI_NUM_PORTS_GEN1 (HFI_MAX_PORT - HFI_MIN_PORT + 1) -#endif - -/* base name of path (without unit #) for qib driver */ -#ifndef HFI_DEVICE_PATH_GEN1 -#define HFI_DEVICE_PATH_GEN1 "/dev/hfi1" -#endif - -#ifdef PSM_CUDA -#define GDR_DEVICE_PATH "/dev/hfi1_gdr" -#endif - -/* The major and minor versions of driver that support non-DW multiple SDMA */ -#define HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED 6 -#define HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED 2 - -/* Commands used to communicate with driver. 
*/ -enum PSMI_HFI_CMD { - PSMI_HFI_CMD_ASSIGN_CTXT = 0, /* allocate HFI and context */ - PSMI_HFI_CMD_CTXT_INFO, /* find out what resources we got */ - PSMI_HFI_CMD_USER_INFO, /* set up userspace */ - PSMI_HFI_CMD_TID_UPDATE, /* update expected TID entries */ - PSMI_HFI_CMD_TID_FREE, /* free expected TID entries */ - PSMI_HFI_CMD_CREDIT_UPD, /* force an update of PIO credit */ - PSMI_HFI_CMD_RECV_CTRL, /* control receipt of packets */ - PSMI_HFI_CMD_POLL_TYPE, /* set the kind of polling we want */ - PSMI_HFI_CMD_ACK_EVENT, /* ack & clear user status bits */ - PSMI_HFI_CMD_SET_PKEY, /* set context's pkey */ - PSMI_HFI_CMD_CTXT_RESET, /* reset context's HW send context */ - PSMI_HFI_CMD_TID_INVAL_READ, /* read TID cache invalidations */ - PSMI_HFI_CMD_GET_VERS, /* get the version of the user cdev */ - -#ifdef PSM_CUDA - PSMI_HFI_CMD_TID_UPDATE_V2 = 28, -#endif - PSMI_HFI_CMD_LAST, -}; - -/* Legacy commands used to communicate with driver using 'write' */ -enum LEGACY_HFI1_CMD { - LEGACY_HFI1_CMD_ASSIGN_CTXT = 1, /* allocate HFI and context */ - LEGACY_HFI1_CMD_CTXT_INFO = 2, /* find out what resources we got */ - LEGACY_HFI1_CMD_USER_INFO = 3, /* set up userspace */ - LEGACY_HFI1_CMD_TID_UPDATE = 4, /* update expected TID entries */ - LEGACY_HFI1_CMD_TID_FREE = 5, /* free expected TID entries */ - LEGACY_HFI1_CMD_CREDIT_UPD = 6, /* force an update of PIO credit */ - - LEGACY_HFI1_CMD_RECV_CTRL = 8, /* control receipt of packets */ - LEGACY_HFI1_CMD_POLL_TYPE = 9, /* set the kind of polling we want */ - LEGACY_HFI1_CMD_ACK_EVENT = 10, /* ack & clear user status bits */ - LEGACY_HFI1_CMD_SET_PKEY = 11, /* set context's pkey */ - LEGACY_HFI1_CMD_CTXT_RESET = 12, /* reset context's HW send context */ - LEGACY_HFI1_CMD_TID_INVAL_READ = 13, /* read TID cache invalidations */ - LEGACY_HFI1_CMD_GET_VERS = 14 /* get the version of the user cdev */ -}; - -/* Given a unit number and port number, returns 1 if the unit and port are active. - returns 0 if the unit and port are not active. returns -1 when an error occurred. */ -int psm3_hfp_gen1_get_port_active(int, int); - - -/* Given the unit number, port and addr_index, */ -/* return an error, or the corresponding LID */ -/* Returns an int, so -1 indicates a general error. -2 indicates that the unit/port - are not active. 0 indicates that the unit is valid, but no LID has been assigned. */ -enum gen1_init_max_speed { GEN1_NOFILTER, GEN1_FILTER, GEN1_FINDMAX }; -int psm3_gen1_get_port_lid(int, int, int, enum gen1_init_max_speed init_max_speed); - -/* Given the unit number, port and addr_index, return an error, or the corresponding */ -/* subnet, addr and gid. For ethernet uses 1st IPv4 RoCE gid. */ -/* For IB/OPA uses 1st valid gid */ -/* Returns an int, so -1 indicates an error. */ -int psm3_hfp_gen1_get_port_subnet(int unit, int port, int addr_index, - psmi_subnet128_t *subnet, psmi_naddr128_t *addr, - int *idx, psmi_gid128_t *gid); - -/* Given a unit and port umber, return an error, or the corresponding speed in bps. */ -/* Returns an int, so -1 indicates an error. 0 on success */ -int psm3_hfp_gen1_get_port_speed(int unit, int port, uint64_t *speed); - -/* Given the unit number, return an error, or the corresponding LMC value - for the port */ -/* Returns an int, so -1 indicates an error. 0 */ -int psm3_gen1_get_port_lmc(int unit, int port); - -/* Given a unit, port and SL, return an error, or the corresponding SC for the - SL as programmed by the SM */ -/* Returns an int, so -1 indicates an error. 
*/ -int psm3_gen1_get_port_sl2sc(int unit, int port, int sl); - -/* Given a unit, port and SC, return an error, or the corresponding VL for the - SC as programmed by the SM */ -/* Returns an int, so -1 indicates an error. */ -int psm3_gen1_get_port_sc2vl(int unit, int port, int sc); - -/* Given a unit, port and VL, return an error, or the corresponding MTU for the - VL as programmed by the SM */ -/* Returns an int, so -1 indicates an error. */ -int psm3_gen1_get_port_vl2mtu(int unit, int port, int vl); - -/* Given a unit, port and index, return an error, or the corresponding pkey for - the index as programmed by the SM */ -/* Returns an int, so -1 indicates an error. */ -int psm3_gen1_get_port_index2pkey(int unit, int port, int index); - -/* Get the number of units supported by the driver. Does not guarantee - that a working chip has been found for each possible unit #. - Returns -1 with errno set, or number of units >=0 (0 means none found). */ -int psm3_hfp_gen1_get_num_units(); - -/* Given a unit number, returns 1 if any port on the unit is active. - returns <=0 if no port on the unit is active. */ -int psm3_gen1_get_unit_active(int unit, enum gen1_init_max_speed init_max_speed); - -/* Given a number of units, returns 1 if any port on the units is active - returns <= 0 if no port on any of the units is active. */ -int psm3_hfp_gen1_have_active_unit(int num_units); - -/* get the number of contexts from the unit id. */ -int psm3_hfp_gen1_get_num_contexts(int unit); - -/* Open hfi device file, return -1 on error. */ -int psm3_gen1_nic_context_open_ex(int unit, int port, uint64_t open_timeout, - char *dev_name,size_t dev_name_len); - -uint32_t psm3_gen1_check_non_dw_mul_sdma(void); - -void psm3_gen1_nic_context_close(int fd); - -/* psm3_gen1_get_user_major_version() returns the major version of the driver - that should be used for this session of psm. Valid only after - psm3_gen1_nic_context_open_ex has been called. */ -uint16_t psm3_gen1_get_user_major_version(void); - -/* psm3_gen1_get_user_minor_version() return the minor version of the driver */ -uint16_t psm3_gen1_get_user_minor_version(void); - -void psm3_gen1_set_user_version(uint32_t version); -void psm3_gen1_set_user_major_version(uint16_t major_version); - -int psm3_gen1_nic_cmd_write(int fd, struct hfi1_cmd *, size_t count); - -int psm3_gen1_nic_cmd_writev(int fd, const struct iovec *iov, int iovcnt); - -/* psm3_gen1_get_cc_settings_bin() returns less than or equal to 0 on failure, - returns greater than 0 on success. */ - int psm3_gen1_get_cc_settings_bin(int unit, int port, char *ccabuf, size_t len_ccabuf); -int psm3_gen1_get_cc_table_bin(int unit, int port, uint16_t **cctp); - -/* We use mmap64() because we compile in both 32 and 64 bit mode, - and we have to map physical addresses that are > 32 bits long. - While linux implements mmap64, it doesn't have a man page, - and isn't declared in any header file, so we declare it here ourselves. */ - -/* We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and - redirects mmap to mmap64 for us, but at least through suse10 and fc4, - it doesn't work when the address being mapped is > 32 bits. It chips - off bits 32 and above. So we stay with mmap64. 
*/ -extern void *mmap64(void *, size_t, int, int, int, __off64_t); -void *psm3_gen1_mmap64(void *, size_t, int, int, int, __off64_t); - -/* Statistics maintained by the driver */ -int psm3_gen1_get_stats(uint64_t *, int); -int psm3_gen1_get_stats_names(char **namep); -int psm3_gen1_get_stats_names_count(void); -const char *psm3_gen1_get_next_name(char **names); -void psm3_gen1_release_names(char *namep); -/* Counters maintained in the chip, globally, and per-prot */ -int psm3_gen1_get_ctrs_unit(int unitno, uint64_t *, int); -int psm3_gen1_get_ctrs_unit_names(int unitno, char **namep); -int psm3_gen1_get_ctrs_unit_names_count(int unitno); -int psm3_gen1_get_ctrs_port(int unitno, int port, uint64_t *, int); -int psm3_gen1_get_ctrs_port_names(int unitno, char **namep); -int psm3_gen1_get_ctrs_port_names_count(int unitno); -uint64_t psm3_gen1_get_single_unitctr(int unit, const char *attr, uint64_t *s); -int psm3_gen1_get_single_portctr(int unit, int port, const char *attr, uint64_t *c); - -int psm3_gen1_cmd_wait_for_packet(int fd); - -#endif /* PSM_HAL_GEN1_SERVICE_H */ -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_spio.c b/psm3/hal_gen1/gen1_spio.c deleted file mode 100644 index 20ebbd9..0000000 --- a/psm3/hal_gen1/gen1_spio.c +++ /dev/null @@ -1,998 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2017 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2017 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -/* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. */ - -#ifndef _GEN1_SPIO_C_ -#define _GEN1_SPIO_C_ - -/* included header files */ -#include -#include -#include -#include -#include - -#include "ips_proto.h" -#include "ips_proto_internal.h" -#include "gen1_spio.h" -#include "ips_proto_params.h" - -/* Report PIO stalls every 20 seconds at the least */ -#define SPIO_STALL_WARNING_INTERVAL (nanosecs_to_cycles(20e9)) -#define SPIO_MAX_CONSECUTIVE_SEND_FAIL (1<<20) /* 1M */ -/* RESYNC_CONSECUTIVE_SEND_FAIL has to be a multiple of MAX_CONSECUTIVE */ -#define SPIO_RESYNC_CONSECUTIVE_SEND_FAIL (1<<4) /* 16 */ - -static void psm3_gen1_spio_report_stall(struct psm3_gen1_spio *ctrl, - uint64_t t_cyc_now, uint64_t send_failures); - -static void psm3_gen1_spio_handle_stall(struct psm3_gen1_spio *ctrl, uint64_t send_failures); - -static psm2_error_t psm3_gen1_spio_reset_hfi(struct psm3_gen1_spio *ctrl); -static psm2_error_t psm3_gen1_spio_reset_hfi_shared(struct psm3_gen1_spio *ctrl); -static psm2_error_t psm3_gen1_spio_credit_return_update(struct psm3_gen1_spio *ctrl); -static psm2_error_t psm3_gen1_spio_credit_return_update_shared(struct psm3_gen1_spio *ctrl); - -static inline psm2_error_t -psm3_gen1_spio_init_internal(const struct psmi_context *context, struct ptl *ptl, - struct psm3_gen1_spio *ctrl -#ifdef PSM_AVX512 - , int is_avx512_enabled -#endif - ) -{ - cpuid_t id; - hfp_gen1_pc_private *psm_hw_ctxt = context->psm_hw_ctxt; - struct _hfi_ctrl *con_ctrl = psm_hw_ctxt->ctrl; - - ctrl->ptl = ptl; - ctrl->context = context; - ctrl->unit_id = context->ep->unit_id; - ctrl->portnum = context->ep->portnum; - - pthread_spin_init(&ctrl->spio_lock, PTHREAD_PROCESS_PRIVATE); - ctrl->spio_credits_addr = (volatile __le64 *) con_ctrl->base_info.sc_credits_addr; - ctrl->spio_bufbase_sop = (volatile uint64_t *)con_ctrl->base_info.pio_bufbase_sop; - ctrl->spio_bufbase = (volatile uint64_t *)con_ctrl->base_info.pio_bufbase; - - ctrl->spio_consecutive_failures = 0; - ctrl->spio_num_stall = 0ULL; - ctrl->spio_num_stall_total = 0ULL; - ctrl->spio_next_stall_warning = 0ULL; - ctrl->spio_last_stall_cyc = 0ULL; - ctrl->spio_init_cyc = get_cycles(); - - ctrl->spio_total_blocks = con_ctrl->ctxt_info.credits; - ctrl->spio_block_index = 0; - - ctrl->spio_ctrl = (struct psm3_gen1_spio_ctrl *)context->spio_ctrl; - if (!ctrl->spio_ctrl) { - ctrl->spio_ctrl = (volatile struct psm3_gen1_spio_ctrl *) - psmi_calloc(context->ep, UNDEFINED, 1, - sizeof(struct psm3_gen1_spio_ctrl)); - if (ctrl->spio_ctrl == NULL) { - return PSM2_NO_MEMORY; - } - - ctrl->spio_reset_hfi = psm3_gen1_spio_reset_hfi; - ctrl->spio_credit_return_update = - psm3_gen1_spio_credit_return_update; - } else { - ctrl->spio_reset_hfi = psm3_gen1_spio_reset_hfi_shared; - ctrl->spio_credit_return_update = - psm3_gen1_spio_credit_return_update_shared; - } - - /* - * Only the master process can initialize. 
- */ - if (psmi_hal_get_subctxt(context->psm_hw_ctxt) == 0) { - pthread_spin_init(&ctrl->spio_ctrl->spio_ctrl_lock, - PTHREAD_PROCESS_SHARED); - - ctrl->spio_ctrl->spio_write_in_progress = 0; - ctrl->spio_ctrl->spio_reset_count = 0; - ctrl->spio_ctrl->spio_frozen_count = 0; - - ctrl->spio_ctrl->spio_available_blocks = - ctrl->spio_total_blocks; - ctrl->spio_ctrl->spio_block_index = 0; - ctrl->spio_ctrl->spio_fill_counter = 0; - - psmi_assert(SPIO_CREDITS_Counter - (ctrl->spio_ctrl->spio_credits.value) == 0); - psmi_assert(SPIO_CREDITS_Status - (ctrl->spio_ctrl->spio_credits.value) == 0); - - ctrl->spio_ctrl->spio_credits.credit_return = - *ctrl->spio_credits_addr; - } - - /* - * Setup the PIO block copying routines. - */ - - get_cpuid(0x1, 0, &id); - - /* 16B copying supported */ - ctrl->spio_blockcpy_med = (id.edx & (1<spio_blockcpy_large = (id.ebx & (1<spio_blockcpy_med; - -#ifdef PSM_AVX512 - /* 64B copying supported */ - ctrl->spio_blockcpy_large = (is_avx512_enabled && (id.ebx & (1<spio_blockcpy_large; - -#endif - - -#ifdef PSM_CUDA - ctrl->cuda_pio_buffer = NULL; -#endif - - _HFI_PRDBG("psm3_gen1_spio_init() done\n"); - - return PSM2_OK; -} - -static inline int psm3_gen1_spio_init(const psmi_context_t *context, - struct ptl *ptl, void **ctrl) -{ - hfp_gen1_pc_private *psm_hw_ctxt = context->psm_hw_ctxt; - -#ifdef PSM_AVX512 - union psmi_envvar_val env_enable_avx512; - psm3_getenv("PSM3_AVX512", - "Enable (set envvar to 1) AVX512 code in PSM (Enabled by default)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env_enable_avx512); - int is_avx512_enabled = env_enable_avx512.e_int; - int rc = psm3_gen1_spio_init_internal(context,ptl, &psm_hw_ctxt->spio_ctrl, is_avx512_enabled); -#else - int rc = psm3_gen1_spio_init_internal(context,ptl, &psm_hw_ctxt->spio_ctrl); -#endif - if (rc >= 0) - { - *ctrl = &psm_hw_ctxt->spio_ctrl; - } - return rc; -} - -static inline psm2_error_t psm3_gen1_spio_fini_internal(struct psm3_gen1_spio *ctrl) -{ -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED && ctrl->cuda_pio_buffer != NULL) - PSMI_CUDA_CALL(cuMemFreeHost, (void *) ctrl->cuda_pio_buffer); -#endif - psm3_gen1_spio_report_stall(ctrl, get_cycles(), 0ULL); - if (!ctrl->context->spio_ctrl) - psmi_free((void *)ctrl->spio_ctrl); - return PSM2_OK; -} - -static inline int psm3_gen1_spio_fini(void **ctrl, psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - int rc = psm3_gen1_spio_fini_internal(&psm_hw_ctxt->spio_ctrl); - - if (!rc) - *ctrl = NULL; - return rc; -} - -static inline -void -psm3_gen1_spio_report_stall(struct psm3_gen1_spio *ctrl, uint64_t t_cyc_now, - uint64_t send_failures) -{ - size_t off = 0; - char buf[1024]; - - if (ctrl->spio_num_stall == 0) - return; - - if (send_failures > 0) { - char bufctr[128]; - uint64_t tx_stat, rx_stat; - int ret; - - off = snprintf(buf, sizeof(buf) - 1, - "PIO Send context %d with total blocks %d , available blocks %d, " - "fill counter %d, free counter %d ", - (int)psm3_epid_context(ctrl->context->epid), - ctrl->spio_total_blocks, - ctrl->spio_ctrl->spio_available_blocks, - ctrl->spio_ctrl->spio_fill_counter, - SPIO_CREDITS_Counter(ctrl->spio_ctrl-> - spio_credits.value)); - buf[off] = '\0'; - - /* In case hfifs isn't running */ - ret = psm3_gen1_get_single_portctr(ctrl->unit_id, ctrl->portnum, - "TxPkt", &tx_stat); - if (ret != -1) { - ret = psm3_gen1_get_single_portctr(ctrl->unit_id, - ctrl->portnum, "RxPkt", - &rx_stat); - if (ret != -1) { - snprintf(bufctr, sizeof(bufctr) - 1, - "(TxPktCnt=%llu,RxPktCnt=%llu)", - 
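[Annotation, not part of the patch] The init path above probes CPUID once and installs wider PIO block-copy routines when the CPU supports them, with AVX-512 additionally gated by the PSM3_AVX512 environment variable. The sketch below shows the same select-once pattern using GCC/Clang's __builtin_cpu_supports() rather than raw CPUID (a deliberate substitution); the copy routines are hypothetical stand-ins that simply alias a scalar 64-byte block copy.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical block-copy signature: PIO blocks are 64 bytes (8 qwords). */
typedef void (*blockcpy_fn)(volatile uint64_t *dst, const uint64_t *src,
			    uint32_t nblocks);

static void copy_block_scalar(volatile uint64_t *dst, const uint64_t *src,
			      uint32_t nblocks)
{
	for (uint32_t b = 0; b < nblocks; b++)
		for (int q = 0; q < 8; q++)
			dst[b * 8 + q] = src[b * 8 + q];
}

/* Wider (16B/32B/64B-at-a-time) variants would go here; in this sketch they
 * just alias the scalar copy. */
static void copy_block_16B(volatile uint64_t *d, const uint64_t *s, uint32_t n) { copy_block_scalar(d, s, n); }
static void copy_block_32B(volatile uint64_t *d, const uint64_t *s, uint32_t n) { copy_block_scalar(d, s, n); }
static void copy_block_64B(volatile uint64_t *d, const uint64_t *s, uint32_t n) { copy_block_scalar(d, s, n); }

/* Select once at init time (x86-only builtins). */
static blockcpy_fn select_blockcpy(int allow_avx512)
{
	__builtin_cpu_init();
	if (allow_avx512 && __builtin_cpu_supports("avx512f"))
		return copy_block_64B;
	if (__builtin_cpu_supports("avx2"))
		return copy_block_32B;
	if (__builtin_cpu_supports("sse2"))
		return copy_block_16B;
	return copy_block_scalar;
}

int main(void)
{
	uint64_t src[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, dst[8] = { 0 };
	blockcpy_fn cpy = select_blockcpy(1);
	cpy(dst, src, 1);
	printf("%llu\n", (unsigned long long)dst[7]);
	return 0;
}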
(unsigned long long)tx_stat, - (unsigned long long)rx_stat); - bufctr[sizeof(bufctr) - 1] = '\0'; - } else - bufctr[0] = '\0'; - } else - bufctr[0] = '\0'; - - _HFI_DBG - ("PIO Send Stall after at least %.2fM failed send attempts " - "(elapsed=%.3fs, last=%.3fs, pio_stall_count=%lld) %s %s\n", - send_failures / 1e6, - PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc), - PSMI_CYCLES_TO_SECSF(t_cyc_now - - ctrl->spio_last_stall_cyc), - (unsigned long long)ctrl->spio_num_stall, - bufctr[0] != '\0' ? bufctr : "", buf); - } else { - _HFI_DBG - ("PIO Send Stall Summary: count=%llu, last=%.3fs, elapsed=%.3fs", - (unsigned long long)ctrl->spio_num_stall, - PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc), - PSMI_CYCLES_TO_SECSF(t_cyc_now - - ctrl->spio_last_stall_cyc)); - } - - return; -} - -static inline void psm3_gen1_spio_handle_stall(struct psm3_gen1_spio *ctrl, uint64_t send_failures) -{ - uint64_t t_cyc_now = get_cycles(); - - /* We handle the pio-stall every time but only report something every 20 - * seconds. We print a summary at the end while closing the device */ - ctrl->spio_num_stall++; - ctrl->spio_num_stall_total++; - - if (ctrl->spio_next_stall_warning <= t_cyc_now) { - /* If context status is ok (i.e. no cables pulled or anything) */ - if (psm3_gen1_context_check_hw_status(((struct ptl_ips *)(ctrl->ptl))->ep) == PSM2_OK) - psm3_gen1_spio_report_stall(ctrl, t_cyc_now, send_failures); - ctrl->spio_next_stall_warning = - get_cycles() + SPIO_STALL_WARNING_INTERVAL; - } - - /* re-initialize our shadow from the real registers; by this time, - * we know the hardware has to have done the update. - * Also, kernel check may have changed things. - */ - ctrl->spio_credit_return_update(ctrl); - - ctrl->spio_last_stall_cyc = t_cyc_now; - - return; -} - -/* - * A send context halt is detected in several ways: - * 1. during pio for normal credit return update; - * 2. during events process when no event; - * when a hfi is frozen, we recover hfi by calling this routine. - */ -static inline void psm3_gen1_spio_reset_context(struct psm3_gen1_spio *ctrl) -{ - /* if there are too many reset, teardown process */ - ctrl->spio_ctrl->spio_reset_count++; - if (ctrl->spio_ctrl->spio_reset_count > IPS_CTXT_RESET_MAX) - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Too many send context reset, teardown...\n"); - - /* - * Because there are many epaddrs and many flows using the - * same PIO queue, it is hard to search all the unacked - * queue and find the correct retry point. Instead we just - * let the upper level flow control to NAK the packets and - * do the retry from the right point. - */ - - /* Call into driver to reset send context, driver will - * block this routine until the send context is actually - * reset. - */ - ips_wmb(); - if (psm3_gen1_hfi_reset_context(ctrl->context->psm_hw_ctxt)) - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Send context reset failed: %d.\n", errno); - - /* Reset spio shared control struct. */ - ctrl->spio_ctrl->spio_available_blocks = - ctrl->spio_total_blocks; - ctrl->spio_ctrl->spio_block_index = 0; - ctrl->spio_ctrl->spio_fill_counter = 0; - /* Get updated credit return again after reset. 
-	 */
-	ctrl->spio_ctrl->spio_credits.credit_return =
-			*ctrl->spio_credits_addr;
-
-	psmi_assert(SPIO_CREDITS_Counter
-			(ctrl->spio_ctrl->spio_credits.value) == 0);
-	psmi_assert(SPIO_CREDITS_Status
-			(ctrl->spio_ctrl->spio_credits.value) == 0);
-}
-
-/*
- * An hfi freeze is detected when checking events from the driver;
- * psm checks for events in the main receive loop
- * when there is no normal traffic.
- */
-static inline void psm3_gen1_spio_reset_hfi_internal(struct psm3_gen1_spio *ctrl)
-{
-	struct ips_recvhdrq *recvq = &((struct ptl_ips *)(ctrl->ptl))->recvq;
-	struct ips_proto *proto = (struct ips_proto *)&((struct ptl_ips *)(ctrl->ptl))->proto;
-
-	/* Reset receive queue state; this must be done first
-	 * because after a send context reset, hardware starts to
-	 * receive new packets.
-	 */
-	recvq->state->hdrq_head = 0;
-	recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE;
-	recvq->state->num_hdrq_done = 0;
-	recvq->state->hdr_countdown = 0;
-
-	/* set the expected sequence number to 1. */
-	if (!(get_psm_gen1_hi()->hfp_private.dma_rtail))
-		psm3_gen1_set_rhf_expected_sequence_number(1, recvq->gen1_cl_hdrq,
-				proto->ep->context.psm_hw_ctxt);
-
-	/* Reset send context */
-	psm3_gen1_spio_reset_context(ctrl);
-
-	/* Reset the sdma completion queue; this should be done last
-	 * because when the send context is reset, the driver will complete
-	 * all the sdma requests with error code -2. This error
-	 * code is ignored by PSM, but other error codes are
-	 * caught inside the routine.
-	 */
-	while (proto->sdma_done_index != proto->sdma_fill_index)
-		psm3_gen1_dma_completion_update(proto);
-}
-
-static inline psm2_error_t psm3_gen1_spio_reset_hfi(struct psm3_gen1_spio *ctrl)
-{
-	/* Drain the receive header queue before resetting the hfi; we use
-	 * the main progression loop to do this so we return from
-	 * here.
-	 */
-	if (!psm3_gen1_recvhdrq_isempty(&((struct ptl_ips *)(ctrl->ptl))->recvq))
-		return PSM2_OK_NO_PROGRESS;
-
-	/* do the real reset work:
-	 * 1. reset receive header queue;
-	 * 2. reset send context;
-	 * 3. drain sdma completion queue;
-	 */
-	psm3_gen1_spio_reset_hfi_internal(ctrl);
-
-	return PSM2_OK;
-}
-
-/*
- * There is a shared count and a per-process count, both initialized to
- * zero. If a process' local count is equal to the shared count, it is
- * the first process and does the hfi reset; this process also moves
- * both counts up by one. If a process' local count is not equal to
- * the shared count, another process has already done the hfi reset,
- * so it just saves the shared count to its local count and returns. All
- * these operations are protected by spio_ctrl_lock.
- */
-static inline psm2_error_t psm3_gen1_spio_reset_hfi_shared(struct psm3_gen1_spio *ctrl)
-{
-	volatile struct psm3_gen1_spio_ctrl *spio_ctrl = ctrl->spio_ctrl;
-
-	/* Drain the receive header queue before resetting the hfi; we use
-	 * the main progression loop to do this so we return from
-	 * here. We don't reset the software receive header queue.
-	 */
-	if (!psm3_gen1_recvhdrq_isempty(&((struct ptl_ips *)(ctrl->ptl))->recvq))
-		return PSM2_OK_NO_PROGRESS;
-
-	pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
-
-	/*
-	 * In context sharing mode, if there is a subcontext
-	 * process in PIO writing, we need to wait till the PIO
-	 * writing is done. So we spin wait here. If other
-	 * process comes here and does the hfi reset, it should
-	 * be perfectly fine.
- */ - while (ctrl->spio_ctrl->spio_write_in_progress) { - pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); - usleep(1000); - pthread_spin_lock(&spio_ctrl->spio_ctrl_lock); - } - - if (ctrl->spio_frozen_count == ctrl->spio_ctrl->spio_frozen_count) { - ctrl->spio_frozen_count++; - ctrl->spio_ctrl->spio_frozen_count++; - - psm3_gen1_spio_reset_hfi_internal(ctrl); - } else - ctrl->spio_frozen_count = ctrl->spio_ctrl->spio_frozen_count; - - pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); - - return PSM2_OK; -} - -/* - * return value: - * PSM2_OK: new credits updated; - * PSM2_OK_NO_PROGRESS: no new credits; - */ -static inline psm2_error_t -psm3_gen1_spio_credit_return_update(struct psm3_gen1_spio *ctrl) -{ - uint64_t credit_return; - - credit_return = *ctrl->spio_credits_addr; - /* Update available blocks based on fill counter and free counter */ - if (ctrl->spio_ctrl->spio_credits.credit_return == credit_return) - return PSM2_OK_NO_PROGRESS; - - ctrl->spio_ctrl->spio_credits.credit_return = credit_return; - - /* If Status is set, then send context is halted */ - if (SPIO_CREDITS_Status(ctrl->spio_ctrl->spio_credits.value)) { - psm3_gen1_spio_reset_context(ctrl); - } else { - /* - * OPA1 has 1M PIO buffer, but each context can have max 64K, - * which is 1K 64B blocks, so the distance between fill counter - * and credit return counter is no more than 1024; Both fill - * counter and credit return counter are 11 bits value, - * representing range [0, 2047]. - */ - psmi_assert((ctrl->spio_ctrl->spio_available_blocks + - ((ctrl->spio_ctrl->spio_fill_counter - - SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits. - value)) & 0x7FF)) <= - ctrl->spio_total_blocks); - ctrl->spio_ctrl->spio_available_blocks = - ctrl->spio_total_blocks - - ((ctrl->spio_ctrl->spio_fill_counter - - SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits. - value)) & 0x7FF); - - /* a successful credit update, clear reset count */ - ctrl->spio_ctrl->spio_reset_count = 0; - } - - return PSM2_OK; -} - -/* - * return value: - * PSM2_OK: new credits updated; - * PSM2_OK_NO_PROGRESS: no new credits; - */ -static inline psm2_error_t -psm3_gen1_spio_credit_return_update_shared(struct psm3_gen1_spio *ctrl) -{ - uint64_t credit_return; - - pthread_spin_lock(&ctrl->spio_ctrl->spio_ctrl_lock); - - credit_return = *ctrl->spio_credits_addr; - /* Update available blocks based on fill counter and free counter */ - if (ctrl->spio_ctrl->spio_credits.credit_return == credit_return) { - pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock); - return PSM2_OK_NO_PROGRESS; - } - - ctrl->spio_ctrl->spio_credits.credit_return = credit_return; - - /* If Status is set, then send context is halted */ - if (SPIO_CREDITS_Status(ctrl->spio_ctrl->spio_credits.value)) { - /* - * In context sharing mode, if there is a subcontext - * process in PIO writing, we need to wait till the PIO - * writing is done. So we spin wait here. Other processes - * won't come here because for them, there is NO new - * credit return change (the first 'if' check in this - * routine). 
- */ - while (ctrl->spio_ctrl->spio_write_in_progress) { - pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock); - usleep(1000); - pthread_spin_lock(&ctrl->spio_ctrl->spio_ctrl_lock); - } - - psm3_gen1_spio_reset_context(ctrl); - } else { - /* - * OPA1 has 1M PIO buffer, but each context can have max 64K, - * which is 1K 64B blocks, so the distance between fill counter - * and credit return counter is no more than 1024; Both fill - * counter and credit return counter are 11 bits value, - * representing range [0, 2047]. - */ - psmi_assert((ctrl->spio_ctrl->spio_available_blocks + - ((ctrl->spio_ctrl->spio_fill_counter - - SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits. - value)) & 0x7FF)) <= - ctrl->spio_total_blocks); - ctrl->spio_ctrl->spio_available_blocks = - ctrl->spio_total_blocks - - ((ctrl->spio_ctrl->spio_fill_counter - - SPIO_CREDITS_Counter(ctrl->spio_ctrl->spio_credits. - value)) & 0x7FF); - - /* a successful credit update, clear reset count */ - ctrl->spio_ctrl->spio_reset_count = 0; - } - - pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock); - - return PSM2_OK; -} - -static inline int -psm3_gen1_ack_hfi_event(uint64_t ack_bits, psmi_hal_hw_context ctxt) -{ - hfp_gen1_pc_private *psm_hw_ctxt = ctxt; - struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; - uint64_t hfi1_ack_bits = 0; - int i; - - /* Decode from HAL event codes to hfi1_events */ - for (i = 0; i < sizeof(hfi1_events_map)/sizeof(hfi1_events_map[0]); i++) - { - if (ack_bits & hfi1_events_map[i].psmi_hal_hfi_event_bit) - hfi1_ack_bits |= - hfi1_events_map[i].hfi1_event_bit; - } - - return psm3_gen1_event_ack(ctrl, hfi1_ack_bits); -} - -/* - * Check and process events - * return value: - * PSM2_OK: normal events processing; - * PSM2_OK_NO_PROGRESS: no event is processed; - */ -PSMI_ALWAYS_INLINE(psm2_error_t -psm3_gen1_spio_process_events(const struct ptl *ptl_gen)) -{ - struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; - struct psm3_gen1_spio *ctrl = ptl->proto.spioc; - uint64_t event_mask; - int rc = psmi_hal_get_hfi_event_bits(&event_mask,ctrl->context->psm_hw_ctxt); - - if (rc) - return PSM2_OK_NO_PROGRESS; - - /* - * If there is no event, try do credit return update - * to catch send context halt. - */ - if_pf(event_mask == 0) - return ctrl->spio_credit_return_update(ctrl); - - /* - * Process mmu invalidation event, this will invalidate - * all caching items removed by mmu notifier. - */ - if (event_mask & PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY) { - /* - * driver will clear the event bit before return, - * PSM does not need to ack the event. - */ -// RNDV_MOD - do we need a similar callback, what triggers MMU NOTIFY -// driver does MMU NOTIFY in mmu_rb.c, seems to be kernel callback when -// pages in process go away? - return ips_tidcache_invalidation(&ptl->proto.protoexp->tidc); - } - - /* Check if HFI is frozen */ - if (event_mask & PSM_HAL_HFI_EVENT_FROZEN) { - /* if no progress, return and retry */ - if (ctrl->spio_reset_hfi(ctrl) != PSM2_OK) - return PSM2_OK_NO_PROGRESS; - } - - /* First ack the driver the receipt of the events */ - _HFI_VDBG("Acking event(s) 0x%" PRIx64 " to qib driver.\n", - (uint64_t) event_mask); - - psm3_gen1_ack_hfi_event(event_mask, ctrl->context->psm_hw_ctxt); - - if (event_mask & PSM_HAL_HFI_EVENT_LINKDOWN) { - /* A link down event can clear the LMC and SL2VL - * change as those events are implicitly handled - * in the link up/down event handler. 
- */ - event_mask &= - ~(PSM_HAL_HFI_EVENT_LMC_CHANGE | - PSM_HAL_HFI_EVENT_SL2VL_CHANGE); - psm3_gen1_ptl_ips_update_linkinfo(&((struct ptl_ips *)(ctrl->ptl))->proto); - _HFI_VDBG("Link down detected.\n"); - } - - if (event_mask & PSM_HAL_HFI_EVENT_LID_CHANGE) { - /* Display a warning that LID change has occurred during - * the run. This is not supported in the current - * implementation and in general is bad for the SM to - * re-assign LIDs during a run. - * We don't filter based on speed, just in case that changed too - */ - _HFI_INFO - ("Warning! LID change detected during run. " - "Old LID: %u, New Lid: %d\n", - psm3_epid_lid(ctrl->context->epid), - psm3_gen1_get_port_lid(ctrl->unit_id, - ctrl->portnum, 0 /*addr_index*/,GEN1_FILTER)); - } - - if (event_mask & PSM_HAL_HFI_EVENT_LMC_CHANGE) - _HFI_INFO("Fabric LMC changed.\n"); - - if (event_mask & PSM_HAL_HFI_EVENT_SL2VL_CHANGE) { - _HFI_INFO("SL2VL mapping changed for port.\n"); - psm3_gen1_ips_ptl_init_sl2sc_table(&((struct ptl_ips *)(ctrl->ptl))->proto); - } - - return PSM2_OK; -} - -static inline void -psm3_gen1_spio_handle_resync(struct psm3_gen1_spio *ctrl, uint64_t consecutive_send_failed) -{ - /* hfi_force_pio_avail_update(ctrl->context->ctrl); */ - - if (!(consecutive_send_failed & (SPIO_MAX_CONSECUTIVE_SEND_FAIL - 1))) - psm3_gen1_spio_handle_stall(ctrl, consecutive_send_failed); -} - -/* - * This function attempts to write a packet to a PIO. - * - * Recoverable errors: - * PSM2_OK: Packet triggered through PIO. - * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled. - * - * Unrecoverable errors: - * PSM2_EP_NO_NETWORK: No network, no lid, ... - * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. - */ -static inline psm2_error_t -psm3_gen1_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, - struct ips_scb *scb, uint32_t *payload, - uint32_t length, uint32_t isCtrlMsg, - uint32_t cksum_valid, uint32_t cksum -#ifdef PSM_CUDA - , uint32_t is_cuda_payload -#endif - ) -{ - struct psm3_gen1_spio *ctrl = proto->spioc; - volatile struct psm3_gen1_spio_ctrl *spio_ctrl = ctrl->spio_ctrl; - volatile uint64_t *pioaddr; - uint32_t paylen, nblks; - psm2_error_t err = PSM2_OK; - int do_lock = psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); - - psmi_assert(flow->transfer == PSM_TRANSFER_PIO); - PSMI_LOCK_ASSERT(proto->mq->progress_lock); - if (do_lock) - pthread_spin_lock(&ctrl->spio_lock); - -#ifdef PSM_FI - if_pf(PSM3_FAULTINJ_ENABLED()) { - PSM3_FAULTINJ_STATIC_DECL(fi_lost, "piosend", - "drop PIO packet before send", - 1, IPS_FAULTINJ_PIOLOST); - PSM3_FAULTINJ_STATIC_DECL(fi_busy, "piobusy", - "report PIO busy before send", - 1, IPS_FAULTINJ_PIOBUSY); - if (PSM3_FAULTINJ_IS_FAULT(fi_lost, proto->ep, "")) { - if (do_lock) - pthread_spin_unlock(&ctrl->spio_lock); - return PSM2_OK; - } else if_pf(PSM3_FAULTINJ_IS_FAULT(fi_busy, proto->ep, "")) - goto fi_busy; - /* else fall through normal processing path, i.e. no faults */ - } -#endif /* #ifdef PSM_FI */ - - psmi_assert((length & 0x3) == 0); - paylen = length + (cksum_valid ? 
PSM_CRC_SIZE_IN_BYTES : 0); - nblks = 1 + ((paylen + 63) >> 6); - - if (spio_ctrl->spio_available_blocks < nblks) { - ctrl->spio_credit_return_update(ctrl); - - if_pf(spio_ctrl->spio_available_blocks < nblks) { - /* Check unit status */ -#ifdef PSM_FI -fi_busy: -#endif /* #ifdef PSM_FI */ - if ((err = - psm3_gen1_context_check_hw_status(((struct ptl_ips *)(ctrl->ptl))->ep)) == - PSM2_OK) { - if (0 == - (++ctrl-> - spio_consecutive_failures & - (SPIO_RESYNC_CONSECUTIVE_SEND_FAIL - 1))) - psm3_gen1_spio_handle_resync(ctrl, - ctrl-> - spio_consecutive_failures); - err = PSM2_EP_NO_RESOURCES; - } - /* If cable is pulled, we don't count it as a consecutive failure, - * we just make it as though no send pio was available */ - else if (err == PSM2_OK_NO_PROGRESS) - err = PSM2_EP_NO_RESOURCES; - /* else something bad happened in check_status */ - if (do_lock) - pthread_spin_unlock(&ctrl->spio_lock); - return err; - } - } - - /* - * if context->spio_ctrl is set, it is pointing to shared context ureg - * page, and we are using context sharing. - */ - if (ctrl->context->spio_ctrl) { - pthread_spin_lock(&spio_ctrl->spio_ctrl_lock); - if (spio_ctrl->spio_available_blocks < nblks) { - pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); - - if (do_lock) - pthread_spin_unlock(&ctrl->spio_lock); - return PSM2_EP_NO_RESOURCES; - } - } - - _HFI_VDBG("credits: total %d, avail %d index %d, fill %d " - "free %d: %d %d %d %d %d; addr %llx\n", - ctrl->spio_total_blocks, - spio_ctrl->spio_available_blocks, - spio_ctrl->spio_block_index, - spio_ctrl->spio_fill_counter, - SPIO_CREDITS_Counter(spio_ctrl->spio_credits.value), - SPIO_CREDITS_Status(spio_ctrl->spio_credits.value), - SPIO_CREDITS_DueToPbc(spio_ctrl->spio_credits.value), - SPIO_CREDITS_DueToTheshold(spio_ctrl->spio_credits.value), - SPIO_CREDITS_DueToErr(spio_ctrl->spio_credits.value), - SPIO_CREDITS_DueToForce(spio_ctrl->spio_credits.value), - *ctrl->spio_credits_addr); - - /* - * Save the assigned locally, update the shared for other processes. - */ - ctrl->spio_block_index = spio_ctrl->spio_block_index; - spio_ctrl->spio_available_blocks -= nblks; - /* fill counter should be 11 bits value, same as credit return counter */ - spio_ctrl->spio_fill_counter = - (spio_ctrl->spio_fill_counter + nblks) & 0x7FF; - spio_ctrl->spio_block_index += nblks; - if (spio_ctrl->spio_block_index >= ctrl->spio_total_blocks) - spio_ctrl->spio_block_index -= ctrl->spio_total_blocks; - - /* - * Unlock in context sharing mode, but increase refcount to - * indicate I am in progress to write to PIO blocks. 
- */ - if (ctrl->context->spio_ctrl) { - spio_ctrl->spio_write_in_progress++; - pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); - } - - ctrl->spio_num_stall = 0; /* now able to send, so clear if set */ - ctrl->spio_consecutive_failures = 0; - if (do_lock) - pthread_spin_unlock(&ctrl->spio_lock); - - _HFI_VDBG("PIO write: nblks %d length %d, paylen %d\n", nblks, length, - paylen); - - /* Setup PBC for this packet */ - psm3_gen1_pbc_update(proto, flow, isCtrlMsg, - &scb->pbc, sizeof(struct ips_message_header), paylen); - - /* Write to PIO: SOP block */ - pioaddr = ctrl->spio_bufbase_sop + ctrl->spio_block_index * 8; - if (++ctrl->spio_block_index == ctrl->spio_total_blocks) - ctrl->spio_block_index = 0; - - ctrl->spio_blockcpy_med(pioaddr, (uint64_t *) &scb->pbc, 1); - _HFI_VDBG("pio qw write sop %p: 8\n", pioaddr); - - /* Write to PIO: other blocks of payload */ -#ifdef PSM_CUDA - if (is_cuda_payload) { - if (ctrl->cuda_pio_buffer == NULL) { - PSM3_GPU_HOST_ALLOC(&ctrl->cuda_pio_buffer, MAX_CUDA_MTU); - } - /* Since the implementation of cuMemcpy is unknown, - and the HFI specifies several conditions for how PIO - writes must occur, for safety reasons we should not assume - that cuMemcpy will follow the HFI's requirements. - The cuMemcpy should instead write into a buffer in - host memory, and then PSM can copy to the HFI as usual. */ - PSM3_GPU_MEMCPY_DTOH(ctrl->cuda_pio_buffer, - payload, paylen); - payload = (uint32_t *) ctrl->cuda_pio_buffer; - } -#endif - if (length >= 64) { - - psm3_gen1_spio_blockcpy_fn_t blockcpy_fn; - if (length >= 256) { - blockcpy_fn = ctrl->spio_blockcpy_large; - } - else { - blockcpy_fn = ctrl->spio_blockcpy_med; - } - - uint32_t blks2send = length >> 6; - uint32_t blks2end = - ctrl->spio_total_blocks - ctrl->spio_block_index; - - pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8; - if (blks2end >= blks2send) { - blockcpy_fn(pioaddr, - (uint64_t *)payload, blks2send); - _HFI_VDBG("pio blk write %p: %d\n", - pioaddr, blks2send); - ctrl->spio_block_index += blks2send; - if (ctrl->spio_block_index == ctrl->spio_total_blocks) - ctrl->spio_block_index = 0; - payload += blks2send*16; - } else { - blockcpy_fn(pioaddr, - (uint64_t *)payload, blks2end); - _HFI_VDBG("pio blk write %p: %d\n", - pioaddr, blks2end); - payload += blks2end*16; - - pioaddr = ctrl->spio_bufbase; - blockcpy_fn(pioaddr, - (uint64_t *)payload, (blks2send-blks2end)); - _HFI_VDBG("pio blk write %p: %d\n", - pioaddr, (blks2send-blks2end)); - ctrl->spio_block_index = blks2send - blks2end; - payload += (blks2send-blks2end)*16; - } - - length -= blks2send*64; - } - - /* - * The following code makes sure to write to pioaddr in - * qword granularity, this is required by hardware. - */ - paylen = length + (cksum_valid ? 
PSM_CRC_SIZE_IN_BYTES : 0); - if (paylen > 0) { - uint32_t blkbuf[32]; - uint32_t qws = length >> 3; - uint32_t dws = 0; - - pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8; - if (++ctrl->spio_block_index == ctrl->spio_total_blocks) - ctrl->spio_block_index = 0; - - /* Write the remaining qwords of payload */ - if (qws) { - psm3_qwordcpy_safe(pioaddr, (uint64_t *) payload, qws); - _HFI_VDBG("pio qw write %p: %d\n", pioaddr, qws); - payload += qws << 1; - length -= qws << 3; - - pioaddr += qws; - paylen -= qws << 3; - } - - /* if we have last one dword payload */ - if (length > 0) { - blkbuf[dws++] = payload[0]; - } - /* if we have checksum to attach */ - if (paylen > length) { - blkbuf[dws++] = cksum; - blkbuf[dws++] = cksum; - } - - /* Write the rest of qwords of current block */ - psm3_qwordcpy_safe(pioaddr, (uint64_t *) blkbuf, 8 - qws); - _HFI_VDBG("pio qw write %p: %d\n", pioaddr, 8 - qws); - - if (paylen > ((8 - qws) << 3)) { - /* We need another block */ - pioaddr = - ctrl->spio_bufbase + ctrl->spio_block_index * 8; - if (++ctrl->spio_block_index == ctrl->spio_total_blocks) - ctrl->spio_block_index = 0; - - /* Write the last block */ - psm3_qwordcpy_safe(pioaddr, - (uint64_t *) &blkbuf[(8 - qws) << 1], - 8); - _HFI_VDBG("pio qw write %p: %d\n", pioaddr, 8); - } - } - /* - * In context sharing, we need to track who is in progress of - * writing to PIO block, this is for halted send context reset. - * I am done with PIO blocks writing, decrease the refcount. - */ - if (ctrl->context->spio_ctrl) { - pthread_spin_lock(&spio_ctrl->spio_ctrl_lock); - spio_ctrl->spio_write_in_progress--; - pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); - } - - return err; -} /* psm3_gen1_spio_transfer_frame() */ -#endif /* PSM_OPA */ -#endif /* _GEN1_SPIO_C_ */ diff --git a/psm3/hal_gen1/gen1_spio.h b/psm3/hal_gen1/gen1_spio.h deleted file mode 100644 index b72db49..0000000 --- a/psm3/hal_gen1/gen1_spio.h +++ /dev/null @@ -1,155 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2017 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2017 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. */ - -#ifndef PSM_HAL_GEN1_SPIO_H -#define PSM_HAL_GEN1_SPIO_H - -#include "psm_user.h" - -#define IPS_CTXT_RESET_MAX 1000 /* max send context reset */ -struct psm3_gen1_spio; -struct ptl; -struct ips_proto; -struct ips_flow; - -typedef -void (*psm3_gen1_spio_blockcpy_fn_t)(volatile uint64_t *dest, - const uint64_t *src, uint32_t nblock); - -#define SPIO_CREDITS_Counter(value) (((value) >> 0) & 0x7FF) -#define SPIO_CREDITS_Status(value) (((value) >> 11) & 0x1) -#define SPIO_CREDITS_DueToPbc(value) (((value) >> 12) & 0x1) -#define SPIO_CREDITS_DueToTheshold(value) (((value) >> 13) & 0x1) -#define SPIO_CREDITS_DueToErr(value) (((value) >> 14) & 0x1) -#define SPIO_CREDITS_DueToForce(value) (((value) >> 15) & 0x1) -struct psm3_gen1_spio_credits { -/* don't use bit operation for performance reason, - * using above macro instead. - uint16_t Counter:11; - uint16_t Status:1; - uint16_t CreditReturnDueToPbc:1; - uint16_t CreditReturnDueToThreshold:1; - uint16_t CreditReturnDueToErr:1; - uint16_t CreditReturnDueToForce:1; -*/ - union { - struct { - uint16_t value; - uint16_t pad0; - uint32_t pad1; - }; - uint64_t credit_return; - }; -}; - -struct psm3_gen1_spio_ctrl { - /* credit return lock for context sharing */ - pthread_spinlock_t spio_ctrl_lock; - - /* PIO write in progress for context sharing */ - volatile uint16_t spio_write_in_progress; - /* send context reset count */ - volatile uint16_t spio_reset_count; - /* HFI frozen count, shared copy */ - volatile uint16_t spio_frozen_count; - - volatile uint16_t spio_available_blocks; - volatile uint16_t spio_block_index; - volatile uint16_t spio_fill_counter; - volatile struct psm3_gen1_spio_credits spio_credits; -} __attribute__ ((aligned(64))); - -struct psm3_gen1_spio { - const psmi_context_t *context; - struct ptl *ptl; - uint16_t unit_id; - uint16_t portnum; - - pthread_spinlock_t spio_lock; /* thread lock */ - volatile __le64 *spio_credits_addr __attribute__ ((aligned(64))); - volatile uint64_t *spio_bufbase_sop; - volatile uint64_t *spio_bufbase; - volatile struct psm3_gen1_spio_ctrl *spio_ctrl; - - uint16_t spio_frozen_count; /* local copy */ - uint16_t spio_total_blocks; - uint16_t spio_block_index; - - uint32_t spio_consecutive_failures; - uint64_t spio_num_stall; - uint64_t spio_num_stall_total; - uint64_t spio_next_stall_warning; - uint64_t spio_last_stall_cyc; - uint64_t spio_init_cyc; - - psm2_error_t (*spio_reset_hfi)(struct psm3_gen1_spio *ctrl); - psm2_error_t (*spio_credit_return_update)(struct psm3_gen1_spio *ctrl); - - /* copying routines based on block size */ - psm3_gen1_spio_blockcpy_fn_t spio_blockcpy_med; - psm3_gen1_spio_blockcpy_fn_t spio_blockcpy_large; - -#ifdef PSM_CUDA - /* Use an 
intermediate buffer when writing PIO data from the - GPU to ensure that we follow the HFI's write ordering rules. */ - unsigned char *cuda_pio_buffer; - -#define MAX_CUDA_MTU 10240 -#endif -}; - -#endif /* PSM_HAL_GEN1_SPIO_H */ -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_types.h b/psm3/hal_gen1/gen1_types.h deleted file mode 100644 index 7323691..0000000 --- a/psm3/hal_gen1/gen1_types.h +++ /dev/null @@ -1,244 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ - -#ifndef PSM_HAL_GEN1_TYPES_H -#define PSM_HAL_GEN1_TYPES_H - -/* some basic datatypes used throughout the gen1 HAL */ - -#define LAST_RHF_SEQNO 13 - -/* HAL assumes that the rx hdr q and the egr buff q are circular lists - with two important indexes: - - head - software takes from this side of the circular list - tail - hardware deposits new content here - -The indexes advance in the list 0, 1, 2, 3, ... until they reach the value: -(number_of_entries_in_the_q-1), then the next value they take is 0. And, -so, that is why these are called circular lists. - -When the head idx == tail idx, that represents an empty circular list. 
- -A completely full circular list is when: - - head_idx == (tail_idx + 1) % number_of_entries_in_the_q - -Both indexes will always be in the range: 0 <= index < number_of_entries_in_the_q - -After software receives the packet in the slot corresponding to the head idx, -and processes it completely, software will signal to the hardware that the slot -is available for re-use by retiring it - see api below for details. - -Note that these are simplified assumptions for the benefit of the hardware independent -layer of PSM. The actual implementation details are hidden in the hal instances. - -Note that subcontexts have a collection of head / tail indexes for their use. - -So, HAL supports the use of the following circular lists dealing with the -following entities: - -1. Rx Hdr q - corresponding to hardware (software modifies head index, hardware modifies tail index). -2. Rx egr q - corresponding to hardware (software modifies head index, hardware modifies tail index). -3. Rx Hdr q - corresponding to a subcontext (software modifies both head and tail indexes). -4. Rx egr q - corresponding to a subcontext (software modifies both head and tail indexes). - -Declare a type to indicate a circular list index: -*/ -typedef uint32_t psm3_gen1_cl_idx; - -typedef enum -{ - PSM3_GEN1_CL_Q_RX_HDR_Q = 0, /* HW context for the rx hdr q. */ - PSM3_GEN1_CL_Q_RX_EGR_Q = 1, /* HW context for the rx eager q. */ - /* Start of subcontexts (This is subcontext 0) */ - PSM3_GEN1_CL_Q_RX_HDR_Q_SC_0 = 2, /* Subcontext 0's rx hdr q. */ - PSM3_GEN1_CL_Q_RX_EGR_Q_SC_0 = 3, /* Subcontext 0's rx eager q. */ - - /* Following SC 0's CL_Q's are the circular list q for subcontexts 1-7, - two per subcontext. Even values are the rx hdr q for the subcontext - Odd value are for the eager q. */ - -/* Given a subcontext number (0-7), return the CL_Q for the RX HDR_Q: */ -#define PSM3_GEN1_GET_SC_CL_Q_RX_HDR_Q(SC) ((SC)*2 + PSM3_GEN1_CL_Q_RX_HDR_Q_SC_0) -/* Given a subcontext number (0-7), return the CL_Q for the RX EGR_Q: */ -#define PSM3_GEN1_GET_SC_CL_Q_RX_EGR_Q(SC) ((SC)*2 + PSM3_GEN1_CL_Q_RX_EGR_Q_SC_0) -} psm3_gen1_cl_q; - -typedef struct -{ - volatile uint64_t *cl_q_head; - volatile uint64_t *cl_q_tail; - union - { - /* hdr_qe's are only present in *_RX_HDR_Q* CL Q types: */ - struct - { - uint32_t rx_hdrq_rhf_seq; - uint32_t *p_rx_hdrq_rhf_seq; - uint32_t *hdrq_base_addr; - } hdr_qe; /* header queue entry */ - /* egr_buffs's are only present in *_RX_EGR_Q* CL Q types: */ - void **egr_buffs; - }; -} psm3_gen1_cl_q_t; - -typedef uint64_t psm3_gen1_raw_rhf_t; - -typedef struct psm3_gen1_rhf_ -{ - /* The first entity in rhf is the decomposed rhf. - psm3_gen1_get_receive_event(), we decompose the raw rhf - obtained from the hardware and deposit the data into this common - decomposed rhf, so the upper layers of psm can find the data in one - uniform place. */ - - uint64_t decomposed_rhf; - - /* The second entry is the raw rhf that comes from the h/w. - The upper layers of psm should not use the raw rhf, instead use the - decomposed rhf above. The raw rhf is intended for use by the HAL - instance only. 
*/ - uint64_t raw_rhf; -} psm3_gen1_rhf_t; - -#define PSM3_GEN1_RHF_ERR_ICRC_NBITS 1 -#define PSM3_GEN1_RHF_ERR_ICRC_SHFTC 63 -#define PSM3_GEN1_RHF_ERR_RSRV_NBITS 1 -#define PSM3_GEN1_RHF_ERR_RSRV_SHFTC 62 -#define PSM3_GEN1_RHF_ERR_ECC_NBITS 1 -#define PSM3_GEN1_RHF_ERR_ECC_SHFTC 61 -#define PSM3_GEN1_RHF_ERR_LEN_NBITS 1 -#define PSM3_GEN1_RHF_ERR_LEN_SHFTC 60 -#define PSM3_GEN1_RHF_ERR_TID_NBITS 1 -#define PSM3_GEN1_RHF_ERR_TID_SHFTC 59 -#define PSM3_GEN1_RHF_ERR_TFGEN_NBITS 1 -#define PSM3_GEN1_RHF_ERR_TFGEN_SHFTC 58 -#define PSM3_GEN1_RHF_ERR_TFSEQ_NBITS 1 -#define PSM3_GEN1_RHF_ERR_TFSEQ_SHFTC 57 -#define PSM3_GEN1_RHF_ERR_RTE_NBITS 3 -#define PSM3_GEN1_RHF_ERR_RTE_SHFTC 56 -#define PSM3_GEN1_RHF_ERR_DC_NBITS 1 -#define PSM3_GEN1_RHF_ERR_DC_SHFTC 55 -#define PSM3_GEN1_RHF_ERR_DCUN_NBITS 1 -#define PSM3_GEN1_RHF_ERR_DCUN_SHFTC 54 -#define PSM3_GEN1_RHF_ERR_KHDRLEN_NBITS 1 -#define PSM3_GEN1_RHF_ERR_KHDRLEN_SHFTC 53 -#define PSM3_GEN1_RHF_ALL_ERR_FLAGS_NBITS (PSM3_GEN1_RHF_ERR_ICRC_NBITS + PSM3_GEN1_RHF_ERR_RSRV_NBITS \ - + PSM3_GEN1_RHF_ERR_ECC_NBITS \ - + PSM3_GEN1_RHF_ERR_LEN_NBITS + PSM3_GEN1_RHF_ERR_TID_NBITS \ - + PSM3_GEN1_RHF_ERR_TFGEN_NBITS + PSM3_GEN1_RHF_ERR_TFSEQ_NBITS \ - + PSM3_GEN1_RHF_ERR_RTE_NBITS + PSM3_GEN1_RHF_ERR_DC_NBITS \ - + PSM3_GEN1_RHF_ERR_DCUN_NBITS + PSM3_GEN1_RHF_ERR_KHDRLEN_NBITS) -#define PSM3_GEN1_RHF_ALL_ERR_FLAGS_SHFTC 53 -#define PSM3_GEN1_RHF_EGR_BUFF_OFF_NBITS 12 -#define PSM3_GEN1_RHF_EGR_BUFF_OFF_SHFTC 32 -#define PSM3_GEN1_RHF_SEQ_NBITS 4 -#define PSM3_GEN1_RHF_SEQ_SHFTC 28 -#define PSM3_GEN1_RHF_EGR_BUFF_IDX_NBITS 11 -#define PSM3_GEN1_RHF_EGR_BUFF_IDX_SHFTC 16 -#define PSM3_GEN1_RHF_USE_EGR_BUFF_NBITS 1 -#define PSM3_GEN1_RHF_USE_EGR_BUFF_SHFTC 15 -#define PSM3_GEN1_RHF_RX_TYPE_NBITS 3 -#define PSM3_GEN1_RHF_RX_TYPE_SHFTC 12 -#define PSM3_GEN1_RHF_PKT_LEN_NBITS 12 -#define PSM3_GEN1_RHF_PKT_LEN_SHFTC 0 - -typedef enum { - PSM3_GEN1_RHF_RX_TYPE_EXPECTED = 0, - PSM3_GEN1_RHF_RX_TYPE_EAGER = 1, - PSM3_GEN1_RHF_RX_TYPE_NON_KD = 2, - PSM3_GEN1_RHF_RX_TYPE_ERROR = 3 -} psm3_gen1_rhf_rx_type; - -#define PSM3_GEN1_RHF_UNPACK(A,NAME) ((uint32_t)((A.decomposed_rhf >> \ - PSM3_GEN1_RHF_ ## NAME ## _SHFTC \ - ) & PSMI_NBITS_TO_MASK( \ - PSM3_GEN1_RHF_ ## NAME ## _NBITS))) -/* define constants for the decomposed rhf error masks. - Note how each of these are shifted by the ALL_ERR_FLAGS shift count. 
*/ - -#define PSM3_GEN1_RHF_ERR_MASK_64(NAME) ((uint64_t)(((PSMI_NBITS_TO_MASK( \ - PSM3_GEN1_RHF_ERR_ ## NAME ## _NBITS) << \ - PSM3_GEN1_RHF_ERR_ ## NAME ## _SHFTC )))) -#define PSM3_GEN1_RHF_ERR_MASK_32(NAME) ((uint32_t)(PSM3_GEN1_RHF_ERR_MASK_64(NAME) >> \ - PSM3_GEN1_RHF_ALL_ERR_FLAGS_SHFTC)) -#define PSM3_GEN1_RHF_ERR_ICRC PSM3_GEN1_RHF_ERR_MASK_32(ICRC) -#define PSM3_GEN1_RHF_ERR_ECC PSM3_GEN1_RHF_ERR_MASK_32(ECC) -#define PSM3_GEN1_RHF_ERR_LEN PSM3_GEN1_RHF_ERR_MASK_32(LEN) -#define PSM3_GEN1_RHF_ERR_TID PSM3_GEN1_RHF_ERR_MASK_32(TID) -#define PSM3_GEN1_RHF_ERR_TFGEN PSM3_GEN1_RHF_ERR_MASK_32(TFGEN) -#define PSM3_GEN1_RHF_ERR_TFSEQ PSM3_GEN1_RHF_ERR_MASK_32(TFSEQ) -#define PSM3_GEN1_RHF_ERR_RTE PSM3_GEN1_RHF_ERR_MASK_32(RTE) -#define PSM3_GEN1_RHF_ERR_DC PSM3_GEN1_RHF_ERR_MASK_32(DC) -#define PSM3_GEN1_RHF_ERR_DCUN PSM3_GEN1_RHF_ERR_MASK_32(DCUN) -#define PSM3_GEN1_RHF_ERR_KHDRLEN PSM3_GEN1_RHF_ERR_MASK_32(KHDRLEN) - -#define psm3_gen1_rhf_get_use_egr_buff(A) PSM3_GEN1_RHF_UNPACK(A,USE_EGR_BUFF) -#define psm3_gen1_rhf_get_egr_buff_index(A) PSM3_GEN1_RHF_UNPACK(A,EGR_BUFF_IDX) -#define psm3_gen1_rhf_get_egr_buff_offset(A) PSM3_GEN1_RHF_UNPACK(A,EGR_BUFF_OFF) -#define psm3_gen1_rhf_get_packet_length(A) (PSM3_GEN1_RHF_UNPACK(A,PKT_LEN)<<2) -#define psm3_gen1_rhf_get_all_err_flags(A) PSM3_GEN1_RHF_UNPACK(A,ALL_ERR_FLAGS) -#define psm3_gen1_rhf_get_seq(A) PSM3_GEN1_RHF_UNPACK(A,SEQ) - -#define psm3_gen1_rhf_get_rx_type(A) PSM3_GEN1_RHF_UNPACK(A,RX_TYPE) -#define PSM3_GEN1_RHF_PACK(NAME,VALUE) ((uint64_t)((((uint64_t)(VALUE)) & \ - PSMI_NBITS_TO_MASK( \ - PSM3_GEN1_RHF_ ## NAME ## _NBITS \ - )) << ( \ - PSM3_GEN1_RHF_ ## NAME ## _SHFTC ))) -#endif /* PSM_HAL_GEN1_TYPES_H */ -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_user.h b/psm3/hal_gen1/gen1_user.h deleted file mode 100644 index f6f682b..0000000 --- a/psm3/hal_gen1/gen1_user.h +++ /dev/null @@ -1,672 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef PSM_HAL_GEN1_USER_H -#define PSM_HAL_GEN1_USER_H - -/* This file contains all of the data structures and routines that are - publicly visible and usable (to low level infrastructure code; it is - not expected that any application, or even normal application-level library, - will ever need to use any of this). - - Additional entry points and data structures that are used by these routines - may be referenced in this file, but they should not be generally available; - they are visible here only to allow use in inlined functions. Any variable, - data structure, or function that starts with a leading "_" is in this - category. -*/ - -/* Include header files we need that are unlikely to otherwise be needed by */ -/* programs. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "utils_user.h" -#include "gen1_types.h" -#include "gen1_common.h" -#include "gen1_service.h" - -#define HFI_RHF_USE_EGRBFR_MASK 0x1 -#define HFI_RHF_USE_EGRBFR_SHIFT 15 -#define HFI_RHF_EGRBFR_INDEX_MASK 0x7FF -#define HFI_RHF_EGRBFR_INDEX_SHIFT 16 - -#define HFI_RHF_SEQ_MASK 0xF -#define HFI_RHF_SEQ_SHIFT 28 -#define HFI_RHF_EGRBFR_OFFSET_MASK 0xFFF -#define HFI_RHF_EGRBFR_OFFSET_SHIFT 0 -#define HFI_RHF_HDRQ_OFFSET_MASK 0x1FF -#define HFI_RHF_HDRQ_OFFSET_SHIFT 12 -#define HFI_RHF_TIDERR 0x08000000 - -/* TidFlow related bits */ -#define HFI_TF_SEQNUM_SHIFT 0 -#define HFI_TF_SEQNUM_MASK 0x7ff - -#define HFI_TF_GENVAL_SHIFT 11 -#define HFI_TF_GENVAL_MASK 0xfffff - -#define HFI_TF_FLOWVALID_SHIFT 32 -#define HFI_TF_FLOWVALID_MASK 0x1 -#define HFI_TF_HDRSUPP_ENABLED_SHIFT 33 -#define HFI_TF_HDRSUPP_ENABLED_MASK 0x1 - -#define HFI_TF_KEEP_AFTER_SEQERR_SHIFT 34 -#define HFI_TF_KEEP_AFTER_SEQERR_MASK 0x1 -#define HFI_TF_KEEP_ON_GENERR_SHIFT 35 -#define HFI_TF_KEEP_ON_GENERR_MASK 0x1 -#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT 36 -#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK 0x1 -#define HFI_TF_STATUS_SEQMISMATCH_SHIFT 37 -#define HFI_TF_STATUS_SEQMISMATCH_MASK 0x1 -#define HFI_TF_STATUS_GENMISMATCH_SHIFT 38 -#define HFI_TF_STATUS_GENMISMATCH_MASK 0x1 - -/* PBC bits */ -#define HFI_PBC_STATICRCC_SHIFT 0 -#define HFI_PBC_STATICRCC_MASK 0xffff - -#define HFI_PBC_SC4_SHIFT 4 -#define HFI_PBC_SC4_MASK 0x1 - -#define HFI_PBC_INTR_SHIFT 31 -#define HFI_PBC_DCINFO_SHIFT 30 -#define HFI_PBC_TESTEBP_SHIFT 29 -#define HFI_PBC_PACKETBYPASS_SHIFT 28 -#define HFI_PBC_INSERTHCRC_SHIFT 26 -#define HFI_PBC_INSERTHCRC_MASK 0x3 -#define HFI_PBC_CREDITRETURN_SHIFT 25 -#define HFI_PBC_INSERTBYPASSICRC_SHIFT 24 -#define HFI_PBC_TESTBADICRC_SHIFT 23 -#define HFI_PBC_FECN_SHIFT 22 -#define HFI_PBC_VL_SHIFT 12 -#define HFI_PBC_VL_MASK 0xf -#define HFI_PBC_LENGTHDWS_SHIFT 0 -#define HFI_PBC_LENGTHDWS_MASK 0xfff - 
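/*
 * Illustrative sketch, not part of the deleted gen1 code: one plausible way
 * the HFI_PBC_* shift/mask macros above combine into the first PBC dword.
 * The helper name and its parameters are hypothetical; in the provider the
 * PBC is actually filled in by psm3_gen1_pbc_update().
 */
static inline __le32 example_pack_pbc0(uint32_t vl, uint32_t length_dws)
{
	uint32_t w = 0;

	/* packet length in dwords occupies bits 0..11 */
	w |= (length_dws & HFI_PBC_LENGTHDWS_MASK) << HFI_PBC_LENGTHDWS_SHIFT;
	/* virtual lane occupies bits 12..15 */
	w |= (vl & HFI_PBC_VL_MASK) << HFI_PBC_VL_SHIFT;
	return __cpu_to_le32(w);
}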
-/* this portion only defines what we currently use */ -struct hfi_pbc { - __le32 pbc0; - __le16 PbcStaticRateControlCnt; - __le16 fill1; -}; - -typedef enum mapsize -{ SC_CREDITS, - PIO_BUFBASE_SOP, - PIO_BUFBASE, - RCVHDR_BUFBASE, - RCVEGR_BUFBASE, - SDMA_COMP_BUFBASE, - USER_REGBASE, - RCVHDRTAIL_BASE, - EVENTS_BUFBASE, - STATUS_BUFBASE, - SUBCTXT_UREGBASE, - SUBCTXT_RCVHDRBUF, - SUBCTXT_RCVEGRBUF, - MAPSIZE_MAX -} mapsize_t; - -/* TODO: consider casting in the ALIGN() macro */ -#define ALIGN(x, a) (((x)+(a)-1)&~((a)-1)) -#define ALIGNDOWN_PTR(x, a) ((void*)(((uintptr_t)(x))&~((uintptr_t)((a)-1)))) - -/* using the same flags for all the mappings */ -#define HFI_MMAP_FLAGS (MAP_SHARED|MAP_LOCKED) -#define HFI_MMAP_PGSIZE sysconf(_SC_PAGESIZE) -/* cast to uintptr_t as opposed to intptr_t which evaluates to a signed type - * * on which one should not perform bitwise operations (undefined behavior) - * */ -#define HFI_MMAP_PGMASK (~(uintptr_t)(HFI_MMAP_PGSIZE-1)) - -/* this is only an auxiliary macro for HFI_MMAP_ERRCHECK() - * @off expected to be unsigned in order to AND with the page mask and avoid undefined behavior - */ -#define U64_TO_OFF64_PGMASK(off) ((__off64_t)((off) & HFI_MMAP_PGMASK)) - -#define HFI_MMAP_ALIGNOFF(fd, off, size, prot) psm3_gen1_mmap64(0,(size),(prot),HFI_MMAP_FLAGS,(fd),U64_TO_OFF64_PGMASK((off))) -/* complementary */ -#define HFI_MUNMAP(addr, size) munmap((addr), (size)) - -/* make sure uintmax_t can hold the result of unsigned int multiplication */ -#if UINT_MAX > (UINTMAX_MAX / UINT_MAX) -#error We cannot safely multiply unsigned integers on this platform -#endif - -/* @member assumed to be of type u64 and validated to be so */ -#define HFI_MMAP_ERRCHECK(fd, binfo, member, size, prot) ({ \ - typeof((binfo)->member) *__tptr = (__u64 *)NULL; \ - (void)__tptr; \ - void *__maddr = HFI_MMAP_ALIGNOFF((fd), (binfo)->member, (size), (prot)); \ - do { \ - if (unlikely(__maddr == MAP_FAILED)) { \ - uintmax_t outval = (uintmax_t)((binfo)->member); \ - _HFI_INFO("mmap of " #member " (0x%jx) size %zu failed: %s\n", \ - outval, size, strerror(errno)); \ - goto err_mmap_##member; \ - } \ - (binfo)->member = (__u64)__maddr; \ - _HFI_VDBG(#member "mmap %jx successful\n", (uintmax_t)((binfo)->member)); \ - } while(0); \ - __maddr; \ -}) - -/* assigns 0 to the member after unmapping */ -#define HFI_MUNMAP_ERRCHECK(binfo, member, size) \ - do { typeof((binfo)->member) *__tptr = (__u64 *)NULL; \ - (void)__tptr; \ - void *__addr = ALIGNDOWN_PTR((binfo)->member, HFI_MMAP_PGSIZE); \ - if (unlikely( __addr == NULL || (munmap(__addr, (size)) == -1))) { \ - _HFI_INFO("unmap of " #member " (%p) failed: %s\n", \ - __addr, strerror(errno)); \ - } \ - else { \ - _HFI_VDBG("unmap of " #member "(%p) succeeded\n", __addr); \ - (binfo)->member = 0; \ - } \ - } while(0) - -#define HFI_PCB_SIZE_IN_BYTES 8 - -/* Usable bytes in header (hdrsize - lrh - bth) */ -#define HFI_MESSAGE_HDR_SIZE_HFI (HFI_MESSAGE_HDR_SIZE-20) - -/* - * SDMA includes 8B sdma hdr, 8B PBC, and message header. - * If we are using GPU workloads, we need to set a new - * "flags" member which takes another 2 bytes in the - * sdma hdr. We let the driver know of this 2 extra bytes - * at runtime when we set the length for the iovecs. 
- */ -#define HFI_SDMA_HDR_SIZE (8+8+56) - -static inline __u32 psm3_gen1_hdrget_seq(const __le32 *rbuf) -{ - return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_SEQ_SHIFT) - & HFI_RHF_SEQ_MASK; -} - -static inline __u32 psm3_gen1_hdrget_hdrq_offset(const __le32 *rbuf) -{ - return (__le32_to_cpu(rbuf[1]) >> HFI_RHF_HDRQ_OFFSET_SHIFT) - & HFI_RHF_HDRQ_OFFSET_MASK; -} - -struct _hfi_ctrl { - int32_t fd; /* device file descriptor */ - /* tidflow valid */ - uint32_t __hfi_tfvalid; - /* unit id */ - uint32_t __hfi_unit; - /* port id */ - uint32_t __hfi_port; - - /* number of eager tid entries */ - uint32_t __hfi_tidegrcnt; - /* number of expected tid entries */ - uint32_t __hfi_tidexpcnt; - - /* effective mtu size, should be <= base_info.mtu */ - uint32_t __hfi_mtusize; - /* max PIO size, should be <= effective mtu size */ - uint32_t __hfi_piosize; - - /* two struct output from driver. */ - struct hfi1_ctxt_info ctxt_info; - struct hfi1_base_info base_info; - - /* some local storages in some condition: */ - /* as storage of __hfi_rcvtidflow in psm3_gen1_userinit_internal(). */ - __le64 regs[HFI_TF_NFLOWS]; - - /* location to which OPA writes the rcvhdrtail register whenever - it changes, so that no chip registers are read in the performance - path. */ - volatile __le64 *__hfi_rcvtail; - - /* address where ur_rcvhdrtail is written */ - volatile __le64 *__hfi_rcvhdrtail; - /* address where ur_rcvhdrhead is written */ - volatile __le64 *__hfi_rcvhdrhead; - /* address where ur_rcvegrindextail is read */ - volatile __le64 *__hfi_rcvegrtail; - /* address where ur_rcvegrindexhead is written */ - volatile __le64 *__hfi_rcvegrhead; - /* address where ur_rcvegroffsettail is read */ - volatile __le64 *__hfi_rcvofftail; - /* address where ur_rcvtidflow is written */ - volatile __le64 *__hfi_rcvtidflow; -}; - -/* After the device is opened, psm3_gen1_userinit() is called to give the driver the - parameters the user code wants to use, and to get the implementation values, - etc. back. 0 is returned on success, a positive value is a standard errno, - and a negative value is reserved for future use. The first argument is - the filedescriptor returned by the device open. - - It is allowed to have multiple devices (and of different types) - simultaneously opened and initialized, although this won't be fully - implemented initially. This routine is used by the low level - hfi protocol code (and any other code that has similar low level - functionality). - This is the only routine that takes a file descriptor, rather than an - struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything - else is returned by this routine. -*/ -struct _hfi_ctrl *psm3_gen1_userinit(int32_t, struct hfi1_user_info_dep *); - -/* Internal function extends API, while original remains for backwards - compatibility with external code -*/ -struct _hfi_ctrl *psm3_gen1_userinit_internal(int32_t, bool, struct hfi1_user_info_dep *); - -/* don't inline these; it's all init code, and not inlining makes the */ -/* overall code shorter and easier to debug */ -void psm3_gen1_touch_mmap(void *, size_t) __attribute__ ((noinline)); - -/* set the BTH pkey to check for this process. */ -/* This is for receive checks, not for sends. It isn't necessary - to set the default key, that's always allowed by the hardware. - If too many pkeys are in use for the hardware to support, this - will return EAGAIN, and the caller should then fail and exit - or use the default key and check the pkey in the received packet - checking. 
*/ -/* set send context pkey to verify, error if driver is not configured with */ -/* this pkey in its pkey table. */ -int psm3_gen1_set_pkey(struct _hfi_ctrl *, uint16_t); - -int psm3_gen1_wait_for_packet(struct _hfi_ctrl *); - -/* New user event mechanism, using spi_sendbuf_status HFI_EVENT_* bits - obsoletes hfi_disarm_bufs(), and extends it, although old mechanism - remains for binary compatibility. */ -int psm3_gen1_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits); - -/* set whether we want an interrupt on all packets, or just urgent ones */ -int psm3_gen1_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type); - -/* reset halted send context, error if context is not halted. */ -int psm3_gen1_nic_reset_context(struct _hfi_ctrl *ctrl); - -static __inline__ void psm3_gen1_tidflow_set_entry(struct _hfi_ctrl *ctrl, - uint32_t flowid, uint32_t genval, - uint32_t seqnum) -{ -/* For proper behavior with RSM interception of FECN packets for CCA, - * the tidflow entry needs the KeepAfterSequenceError bit set. - * A packet that is converted from expected to eager by RSM will not - * trigger an update in the tidflow state. This will cause the tidflow - * to incorrectly report a sequence error on any non-FECN packets that - * arrive after the RSM intercepted packets. If the KeepAfterSequenceError - * bit is set, PSM can properly detect this "false SeqErr" condition, - * and recover without dropping packets. - * Note that if CCA/RSM are not important, this change will slightly - * increase the CPU load when packets are dropped. If this is significant, - * consider hiding this change behind a CCA/RSM environment variable. - */ - - ctrl->__hfi_rcvtidflow[flowid] = __cpu_to_le64( - ((genval & HFI_TF_GENVAL_MASK) << HFI_TF_GENVAL_SHIFT) | - ((seqnum & HFI_TF_SEQNUM_MASK) << HFI_TF_SEQNUM_SHIFT) | - ((uint64_t)ctrl->__hfi_tfvalid << HFI_TF_FLOWVALID_SHIFT) | - (1ULL << HFI_TF_HDRSUPP_ENABLED_SHIFT) | - /* KeepAfterSequenceError = 1 -- previously was 0 */ - (1ULL << HFI_TF_KEEP_AFTER_SEQERR_SHIFT) | - (1ULL << HFI_TF_KEEP_ON_GENERR_SHIFT) | - /* KeePayloadOnGenErr = 0 */ - (1ULL << HFI_TF_STATUS_SEQMISMATCH_SHIFT) | - (1ULL << HFI_TF_STATUS_GENMISMATCH_SHIFT)); -} - -static __inline__ void psm3_gen1_tidflow_reset(struct _hfi_ctrl *ctrl, - uint32_t flowid, uint32_t genval, - uint32_t seqnum) -{ -/* - * If a tidflow table entry is set to "Invalid", we want to drop - * header if payload is dropped, we want to get a header if the payload - * is delivered. - * - * We set a tidflow table entry "Invalid" by setting FlowValid=1 and - * GenVal=0x1FFF/0xFFFFF, this is a special generation number and no - * packet will use this value. We don't care SeqNum but we set it to - * 0x7FF. So if GenVal does not match, the payload is dropped because - * KeepPayloadOnGenErr=0; for packet header, KeepOnGenErr=0 make sure - * header is not generated. But if a packet happens to have the special - * generation number, the payload is delivered, HdrSuppEnabled=0 make - * sure header is generated if SeqNUm matches, if SeqNum does not match, - * KeepAfterSeqErr=1 makes sure the header is generated. 
- */ - ctrl->__hfi_rcvtidflow[flowid] = __cpu_to_le64( - /* genval = 0x1FFF or 0xFFFFF */ - ((genval & HFI_TF_GENVAL_MASK) << HFI_TF_GENVAL_SHIFT) | - /* seqnum = 0x7FF */ - ((seqnum & HFI_TF_SEQNUM_MASK) << HFI_TF_SEQNUM_SHIFT) | - ((uint64_t)ctrl->__hfi_tfvalid << HFI_TF_FLOWVALID_SHIFT) | - /* HdrSuppEnabled = 0 */ - (1ULL << HFI_TF_KEEP_AFTER_SEQERR_SHIFT) | - /* KeepOnGenErr = 0 */ - /* KeepPayloadOnGenErr = 0 */ - (1ULL << HFI_TF_STATUS_SEQMISMATCH_SHIFT) | - (1ULL << HFI_TF_STATUS_GENMISMATCH_SHIFT)); -} - -/* - * This should only be used for debugging. - * Normally, we shouldn't read the chip. - */ -static __inline__ uint64_t psm3_gen1_tidflow_get(struct _hfi_ctrl *ctrl, - uint32_t flowid) -{ - return __le64_to_cpu(ctrl->__hfi_rcvtidflow[flowid]); -} - -static __inline__ uint32_t psm3_gen1_tidflow_get_seqnum(uint64_t val) -{ - return (val >> HFI_TF_SEQNUM_SHIFT) & HFI_TF_SEQNUM_MASK; -} - -static __inline__ uint32_t psm3_gen1_tidflow_get_genval(uint64_t val) -{ - return (val >> HFI_TF_GENVAL_SHIFT) & HFI_TF_GENVAL_MASK; -} - -static __inline__ uint32_t psm3_gen1_tidflow_get_flowvalid(uint64_t val) -{ - return (val >> HFI_TF_FLOWVALID_SHIFT) & HFI_TF_FLOWVALID_MASK; -} - -static __inline__ uint32_t psm3_gen1_tidflow_get_enabled(uint64_t val) -{ - return (val >> HFI_TF_HDRSUPP_ENABLED_SHIFT) & - HFI_TF_HDRSUPP_ENABLED_MASK; -} - -static __inline__ uint32_t psm3_gen1_tidflow_get_keep_after_seqerr(uint64_t val) -{ - return (val >> HFI_TF_KEEP_AFTER_SEQERR_SHIFT) & - HFI_TF_KEEP_AFTER_SEQERR_MASK; -} - -static __inline__ uint32_t psm3_gen1_tidflow_get_keep_on_generr(uint64_t val) -{ - return (val >> HFI_TF_KEEP_ON_GENERR_SHIFT) & - HFI_TF_KEEP_ON_GENERR_MASK; -} - -static __inline__ uint32_t psm3_gen1_tidflow_get_keep_payload_on_generr(uint64_t val) -{ - return (val >> HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT) & - HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK; -} - -/* For tidflow_get_seqmismatch and tidflow_get_genmismatch, if - val was obtained from tidflow_get_hw(), then these will be valid - but, if val was obtained from tidflow_get(), then these will - always return 0. */ -static __inline__ uint32_t psm3_gen1_tidflow_get_seqmismatch(uint64_t val) -{ - return (val >> HFI_TF_STATUS_SEQMISMATCH_SHIFT) & - HFI_TF_STATUS_SEQMISMATCH_MASK; -} - -static __inline__ uint32_t psm3_gen1_tidflow_get_genmismatch(uint64_t val) -{ - return (val >> HFI_TF_STATUS_GENMISMATCH_SHIFT) & - HFI_TF_STATUS_GENMISMATCH_MASK; -} - -/* - * This should only be used by a process to write the eager index into - * a subcontext's eager header entry. - */ -static __inline__ void psm3_gen1_hdrset_use_egrbfr(__le32 *rbuf, uint32_t val) -{ - rbuf[0] = - (rbuf[0] & - __cpu_to_le32(~(HFI_RHF_USE_EGRBFR_MASK << - HFI_RHF_USE_EGRBFR_SHIFT))) | - __cpu_to_le32((val & HFI_RHF_USE_EGRBFR_MASK) << - HFI_RHF_USE_EGRBFR_SHIFT); -} - -static __inline__ void psm3_gen1_hdrset_egrbfr_index(__le32 *rbuf, uint32_t val) -{ - rbuf[0] = - (rbuf[0] & - __cpu_to_le32(~(HFI_RHF_EGRBFR_INDEX_MASK << - HFI_RHF_EGRBFR_INDEX_SHIFT))) | - __cpu_to_le32((val & HFI_RHF_EGRBFR_INDEX_MASK) << - HFI_RHF_EGRBFR_INDEX_SHIFT); -} - -static __inline__ void psm3_gen1_hdrset_egrbfr_offset(__le32 *rbuf, uint32_t val) -{ - rbuf[1] = - (rbuf[1] & - __cpu_to_le32(~(HFI_RHF_EGRBFR_OFFSET_MASK << - HFI_RHF_EGRBFR_OFFSET_SHIFT))) | - __cpu_to_le32((val & HFI_RHF_EGRBFR_OFFSET_MASK) << - HFI_RHF_EGRBFR_OFFSET_SHIFT); -} - -/* - * This should only be used by a process to update the receive header - * error flags. 
- */ -static __inline__ void psm3_gen1_hdrset_err_flags(__le32 *rbuf, uint32_t val) -{ - rbuf[1] |= __cpu_to_le32(val); -} - -/* - * This should only be used by a process to write the rhf seq number into - * a subcontext's eager header entry. - */ -static __inline__ void psm3_gen1_hdrset_seq(__le32 *rbuf, uint32_t val) -{ - rbuf[0] = - (rbuf[0] & - __cpu_to_le32(~(HFI_RHF_SEQ_MASK << - HFI_RHF_SEQ_SHIFT))) | - __cpu_to_le32((val & HFI_RHF_SEQ_MASK) << HFI_RHF_SEQ_SHIFT); -} - -/* Manage TID entries. It is possible that not all entries - requested may be allocated. A matching psm3_gen1_free_tid() must be - done for each psm3_gen1_update_tid(), because currently no caching or - reuse of expected tid entries is allowed, to work around malloc/free - and mmap/munmap issues. The driver decides which TID entries to allocate. - If psm3_gen1_free_tid is called to free entries in use by a different - send by the same process, data corruption will probably occur, - but only within that process, not for other processes. -*/ - -/* update tidcnt expected TID entries from the array pointed to by tidinfo. */ -/* Returns 0 on success, else an errno. See full description at declaration */ -static __inline__ int32_t psm3_gen1_update_tid(struct _hfi_ctrl *ctrl, - uint64_t vaddr, uint32_t *length, - uint64_t tidlist, uint32_t *tidcnt, uint16_t flags) -{ - struct hfi1_cmd cmd; - struct hfi1_tid_info tidinfo; -#ifdef PSM_CUDA - struct hfi1_tid_info_v2 tidinfov2; -#endif - int err; - - tidinfo.vaddr = vaddr; /* base address for this send to map */ - tidinfo.length = *length; /* length of vaddr */ - - tidinfo.tidlist = tidlist; /* driver copies tids back directly */ - tidinfo.tidcnt = 0; /* clear to zero */ - - cmd.type = PSMI_HFI_CMD_TID_UPDATE; - cmd.len = sizeof(tidinfo); - cmd.addr = (__u64) &tidinfo; -#ifdef PSM_CUDA - if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { - /* Copy values to v2 struct */ - tidinfov2.vaddr = tidinfo.vaddr; - tidinfov2.length = tidinfo.length; - tidinfov2.tidlist = tidinfo.tidlist; - tidinfov2.tidcnt = tidinfo.tidcnt; - tidinfov2.flags = flags; - - cmd.type = PSMI_HFI_CMD_TID_UPDATE_V2; - cmd.len = sizeof(tidinfov2); - cmd.addr = (__u64) &tidinfov2; - } -#endif - - err = psm3_gen1_nic_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); - - if (err != -1) { - struct hfi1_tid_info *rettidinfo = - (struct hfi1_tid_info *)cmd.addr; - *length = rettidinfo->length; - *tidcnt = rettidinfo->tidcnt; - } - - return err; -} - -static __inline__ int32_t psm3_gen1_free_tid(struct _hfi_ctrl *ctrl, - uint64_t tidlist, uint32_t tidcnt) -{ - struct hfi1_cmd cmd; - struct hfi1_tid_info tidinfo; - int err; - - tidinfo.tidlist = tidlist; /* input to driver */ - tidinfo.tidcnt = tidcnt; - - cmd.type = PSMI_HFI_CMD_TID_FREE; - cmd.len = sizeof(tidinfo); - cmd.addr = (__u64) &tidinfo; - - err = psm3_gen1_nic_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); - - return err; -} - -static __inline__ int32_t psm3_gen1_get_invalidation(struct _hfi_ctrl *ctrl, - uint64_t tidlist, uint32_t *tidcnt) -{ - struct hfi1_cmd cmd; - struct hfi1_tid_info tidinfo; - int err; - - tidinfo.tidlist = tidlist; /* driver copies tids back directly */ - tidinfo.tidcnt = 0; /* clear to zero */ - - cmd.type = PSMI_HFI_CMD_TID_INVAL_READ; - cmd.len = sizeof(tidinfo); - cmd.addr = (__u64) &tidinfo; - - err = psm3_gen1_nic_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); - - if (err != -1) - *tidcnt = tidinfo.tidcnt; - - return err; -} - -/* - * Data layout in I2C flash (for GUID, etc.) 
- * All fields are little-endian binary unless otherwise stated - */ -#define HFI_FLASH_VERSION 2 -struct hfi_flash { - /* flash layout version (HFI_FLASH_VERSION) */ - __u8 if_fversion; - /* checksum protecting if_length bytes */ - __u8 if_csum; - /* - * valid length (in use, protected by if_csum), including - * if_fversion and if_csum themselves) - */ - __u8 if_length; - /* the GUID, in network order */ - __u8 if_guid[8]; - /* number of GUIDs to use, starting from if_guid */ - __u8 if_numguid; - /* the (last 10 characters of) board serial number, in ASCII */ - char if_serial[12]; - /* board mfg date (YYYYMMDD ASCII) */ - char if_mfgdate[8]; - /* last board rework/test date (YYYYMMDD ASCII) */ - char if_testdate[8]; - /* logging of error counts, TBD */ - __u8 if_errcntp[4]; - /* powered on hours, updated at driver unload */ - __u8 if_powerhour[2]; - /* ASCII free-form comment field */ - char if_comment[32]; - /* Backwards compatible prefix for longer QLogic Serial Numbers */ - char if_sprefix[4]; - /* 82 bytes used, min flash size is 128 bytes */ - __u8 if_future[46]; -}; - -#endif /* PSM_HAL_GEN1_USER_H */ -#endif /* PSM_OPA */ diff --git a/psm3/hal_gen1/gen1_utils.c b/psm3/hal_gen1/gen1_utils.c deleted file mode 100644 index bbcf765..0000000 --- a/psm3/hal_gen1/gen1_utils.c +++ /dev/null @@ -1,401 +0,0 @@ -#ifdef PSM_OPA -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* This file contains hfi service routine interface used by the low */ -/* level hfi protocol code. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gen1_user.h" - -/* touch the pages, with a 32 bit read */ -void psm3_gen1_touch_mmap(void *m, size_t bytes) -{ - volatile uint32_t *b = (volatile uint32_t *)m, c; - size_t i; /* m is always page aligned, so pgcnt exact */ - int __hfi_pg_sz; - - /* First get the page size */ - __hfi_pg_sz = sysconf(_SC_PAGESIZE); - - _HFI_VDBG("Touch %lu mmap'ed pages starting at %p\n", - (unsigned long)bytes / __hfi_pg_sz, m); - bytes /= sizeof(c); - for (i = 0; i < bytes; i += __hfi_pg_sz / sizeof(c)) - c = b[i]; -} - -/* ack event bits, and clear them. Usage is check *spi_sendbuf_status, - pass bits you are prepared to handle to psm3_gen1_event_ack(), perform the - appropriate actions for bits that were set, and then (if appropriate) - check the bits again. */ -int psm3_gen1_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits) -{ - struct hfi1_cmd cmd; - - cmd.type = PSMI_HFI_CMD_ACK_EVENT; - cmd.len = 0; - cmd.addr = ackbits; - - if (psm3_gen1_nic_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { - if (errno != EINVAL) /* not implemented in driver. */ - _HFI_DBG("event ack failed: %s\n", strerror(errno)); - return -1; - } - return 0; -} - -/* Tell the driver to change the way packets can generate interrupts. - - HFI1_POLL_TYPE_URGENT: Generate interrupt only when packet sets - HFI_KPF_INTR - HFI1_POLL_TYPE_ANYRCV: wakeup on any rcv packet (when polled on). - - PSM: Uses TYPE_URGENT in ips protocol -*/ -int psm3_gen1_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type) -{ - struct hfi1_cmd cmd; - - cmd.type = PSMI_HFI_CMD_POLL_TYPE; - cmd.len = 0; - cmd.addr = (uint64_t) poll_type; - - if (psm3_gen1_nic_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { - if (errno != EINVAL) /* not implemented in driver */ - _HFI_INFO("poll type failed: %s\n", strerror(errno)); - return -1; - } - return 0; -} - -/* set the send context pkey to check BTH pkey in each packet. - driver should check its pkey table to see if it can find - this pkey, if not, driver should return error. */ -int psm3_gen1_set_pkey(struct _hfi_ctrl *ctrl, uint16_t pkey) -{ - struct hfi1_cmd cmd; - struct hfi1_base_info tbinfo; - - cmd.type = PSMI_HFI_CMD_SET_PKEY; - cmd.len = 0; - cmd.addr = (uint64_t) pkey; - - _HFI_VDBG("Setting context pkey to 0x%04x.\n", pkey); - if (psm3_gen1_nic_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { - _HFI_INFO("Setting context pkey to 0x%04x failed: %s\n", - pkey, strerror(errno)); - return -1; - } else { - _HFI_VDBG("Successfully set context pkey to 0x%04x.\n", pkey); - } - - if (getenv("PSM3_SELINUX")) { - /* - * If SELinux is in use the kernel may have changed our JKey based on - * what we supply for the PKey so go ahead and interrogate the user info - * again and update our saved copy. 
In the future there may be a new - * IOCTL to get the JKey only. For now, this temporary workaround works. - */ - cmd.type = PSMI_HFI_CMD_USER_INFO; - cmd.len = sizeof(tbinfo); - cmd.addr = (uint64_t) &tbinfo; - - if (psm3_gen1_nic_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { - _HFI_VDBG("BASE_INFO command failed in setpkey: %s\n", - strerror(errno)); - return -1; - } - _HFI_VDBG("PSM3_SELINUX is set, updating jkey to 0x%04x\n", tbinfo.jkey); - ctrl->base_info.jkey = tbinfo.jkey; - } - return 0; -} - -/* Tell the driver to reset the send context. if the send context - if halted, reset it, if not, return error back to caller. - After context reset, the credit return should be reset to - zero by a hardware credit return DMA. - Driver will return ENOLCK if the reset is timeout, in this - case PSM needs to re-call again. */ -int psm3_gen1_nic_reset_context(struct _hfi_ctrl *ctrl) -{ - struct hfi1_cmd cmd; - - cmd.type = PSMI_HFI_CMD_CTXT_RESET; - cmd.len = 0; - cmd.addr = 0; - -retry: - if (psm3_gen1_nic_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { - if (errno == ENOLCK) - goto retry; - - if (errno != EINVAL) - _HFI_INFO("reset ctxt failed: %s\n", strerror(errno)); - return -1; - } - return 0; -} - -/* wait for a received packet for our context - This allows us to not busy wait, if nothing has happened for a - while, which allows better measurements of cpu utilization, and - in some cases, slightly better performance. Called where we would - otherwise call sched_yield(). It is not guaranteed that a packet - has arrived, so the normal checking loop(s) should be done. - - PSM: not used as is, PSM has it's own use of polling for interrupt-only - packets (sets psm3_gen1_poll_type to TYPE_URGENT) */ -int psm3_gen1_wait_for_packet(struct _hfi_ctrl *ctrl) -{ - return psm3_gen1_cmd_wait_for_packet(ctrl->fd); -} - -const char *psm3_gen1_get_next_name(char **names) -{ - char *p, *start; - - p = start = *names; - while (*p != '\0' && *p != '\n') { - p++; - } - if (*p == '\n') { - *p = '\0'; - p++; - *names = p; - return start; - } else - return NULL; -} - -void psm3_gen1_release_names(char *namep) -{ - /* names are allocated when hfi_hfifs_read() is called. Allocation - * for names is done only once at init time. Should we eventually - * have an "stats_type_unregister" type of routine to explicitly - * deallocate memory and free resources ? 
- */ -#if 0 - if (namep != NULL) - psm3_hfifs_free(namep); -#endif -} - -/* These have been fixed to read the values, but they are not - * compatible with the hfi driver, they return new info with - * the qib driver - */ -static int psm3_gen1_count_names(const char *namep) -{ - int n = 0; - while (*namep != '\0') { - if (*namep == '\n') - n++; - namep++; - } - return n; -} - -static int psm3_gen1_lookup_stat(const char *attr, char *namep, uint64_t *stats, - uint64_t *s) -{ - const char *p; - int i, ret = -1, len = strlen(attr); - int nelem = psm3_gen1_count_names(namep); - - for (i = 0; i < nelem; i++) { - p = psm3_gen1_get_next_name(&namep); - if (p == NULL) - break; - if (strncasecmp(p, attr, len + 1) == 0) { - ret = i; - *s = stats[i]; - } - } - return ret; -} - -int psm3_gen1_get_single_portctr(int unit, int port, const char *attr, uint64_t *s) -{ - int nelem, n = 0, ret = -1; - char *namep = NULL; - uint64_t *stats = NULL; - - nelem = psm3_gen1_get_ctrs_port_names(unit, &namep); - if (nelem == -1 || namep == NULL) - goto bail; - stats = calloc(nelem, sizeof(uint64_t)); - if (stats == NULL) - goto bail; - n = psm3_gen1_get_ctrs_port(unit, port, stats, nelem); - if (n != nelem) - goto bail; - ret = psm3_gen1_lookup_stat(attr, namep, stats, s); -bail: - if (namep != NULL) - psm3_hfifs_free(namep); - if (stats != NULL) - free(stats); - return ret; -} - -int psm3_gen1_get_stats_names_count() -{ - char *namep; - int c; - - c = psm3_gen1_get_stats_names(&namep); - psm3_hfifs_free(namep); - return c; -} - -int psm3_gen1_get_ctrs_unit_names_count(int unitno) -{ - char *namep; - int c; - - c = psm3_gen1_get_ctrs_unit_names(unitno, &namep); - psm3_hfifs_free(namep); - return c; -} - -int psm3_gen1_get_ctrs_port_names_count(int unitno) -{ - char *namep; - int c; - - c = psm3_gen1_get_ctrs_port_names(unitno, &namep); - psm3_hfifs_free(namep); - return c; -} - -/* These have been fixed to read the values, but they are not - * compatible with the hfi driver, they return new info with - * the qib driver - */ -int psm3_gen1_get_ctrs_unit_names(int unitno, char **namep) -{ - int i; - i = psm3_hfifs_unit_read(unitno, "counter_names", namep); - if (i < 0) - return -1; - else - return psm3_gen1_count_names(*namep); -} - -int psm3_gen1_get_ctrs_unit(int unitno, uint64_t *c, int nelem) -{ - int i; - i = psm3_hfifs_unit_rd(unitno, "counters", c, nelem * sizeof(*c)); - if (i < 0) - return -1; - else - return i / sizeof(*c); -} - -int psm3_gen1_get_ctrs_port_names(int unitno, char **namep) -{ - int i; - i = psm3_hfifs_unit_read(unitno, "portcounter_names", namep); - if (i < 0) - return -1; - else - return psm3_gen1_count_names(*namep); -} - -int psm3_gen1_get_ctrs_port(int unitno, int port, uint64_t *c, int nelem) -{ - int i; - char buf[32]; - snprintf(buf, sizeof(buf), "port%dcounters", port); - i = psm3_hfifs_unit_rd(unitno, buf, c, nelem * sizeof(*c)); - if (i < 0) - return -1; - else - return i / sizeof(*c); -} - -int psm3_gen1_get_stats_names(char **namep) -{ - int i; - i = psm3_hfifs_read("driver_stats_names", namep); - if (i < 0) - return -1; - else - return psm3_gen1_count_names(*namep); -} - -int psm3_gen1_get_stats(uint64_t *s, int nelem) -{ - int i; - i = psm3_hfifs_rd("driver_stats", s, nelem * sizeof(*s)); - if (i < 0) - return -1; - else - return i / sizeof(*s); -} -#endif /* PSM_OPA */ diff --git a/psm3/hal_verbs/verbs_ep.c b/psm3/hal_verbs/verbs_ep.c index 57bffc0..5c649fa 100644 --- a/psm3/hal_verbs/verbs_ep.c +++ b/psm3/hal_verbs/verbs_ep.c @@ -2000,23 +2000,6 @@ static psm2_error_t 
verbs_open_dev(psm2_ep_t ep, int unit, int port, int addr_in } #if defined(USE_RC) -#if defined(USE_RDMA_READ) - { - struct ibv_device_attr dev_attr; - // get RDMA capabilities of device - if (ibv_query_device(ep->verbs_ep.context, &dev_attr)) { - _HFI_ERROR("Unable query device %s: %s\n", ep->dev_name, - strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - ep->verbs_ep.max_qp_rd_atom = dev_attr.max_qp_rd_atom; - ep->verbs_ep.max_qp_init_rd_atom = dev_attr.max_qp_init_rd_atom; - _HFI_PRDBG("got device attr: rd_atom %u init_rd_atom %u\n", - dev_attr.max_qp_rd_atom, dev_attr.max_qp_init_rd_atom); - // TBD could have an env variable to reduce requested values - } -#endif #endif // USE_RC #ifdef UMR_CACHE if (ep->mr_cache_mode == MR_CACHE_MODE_USER) { @@ -2348,9 +2331,6 @@ psm2_error_t modify_rc_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp) //attr.qkey = ep->verbs_ep.qkey; //flags |= IBV_QP_QKEY; // only allowed for UD attr.qp_access_flags = 0; -#ifdef USE_RDMA_READ - attr.qp_access_flags |= IBV_ACCESS_REMOTE_READ; -#endif attr.qp_access_flags |= IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; //attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC; flags |= IBV_QP_ACCESS_FLAGS; @@ -2386,10 +2366,6 @@ psm2_error_t modify_rc_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp, attr.rq_psn = initpsn; flags |= (IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN); -#ifdef USE_RDMA_READ - attr.max_dest_rd_atomic = min(ep->verbs_ep.max_qp_rd_atom, - req_attr->initiator_depth); -#endif _HFI_PRDBG("set max_dest_rd_atomic to %u\n", attr.max_dest_rd_atomic); attr.min_rnr_timer = 12; // TBD well known flags |= (IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC); @@ -2417,10 +2393,6 @@ psm2_error_t modify_rc_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp, attr.sq_psn = initpsn; // value we told other side flags |= IBV_QP_SQ_PSN; -#ifdef USE_RDMA_READ - attr.max_rd_atomic = min(ep->verbs_ep.max_qp_init_rd_atom, - req_attr->responder_resources); -#endif _HFI_PRDBG("set max_rd_atomic to %u\n", attr.max_rd_atomic); flags |= IBV_QP_MAX_QP_RD_ATOMIC; diff --git a/psm3/hal_verbs/verbs_ep.h b/psm3/hal_verbs/verbs_ep.h index 4c839d6..1bc4e62 100644 --- a/psm3/hal_verbs/verbs_ep.h +++ b/psm3/hal_verbs/verbs_ep.h @@ -300,10 +300,6 @@ struct psm3_verbs_ep { //uint8_t link_layer; // IBV_LINK_LAYER_ETHERNET or other uint8_t active_rate; #if defined(USE_RC) -#if defined(USE_RDMA_READ) - uint8_t max_qp_rd_atom; - uint8_t max_qp_init_rd_atom; -#endif #endif // USE_RC struct psm3_verbs_send_pool send_pool; struct psm3_verbs_send_allocator send_allocator; diff --git a/psm3/hal_verbs/verbs_hal_inline_i.h b/psm3/hal_verbs/verbs_hal_inline_i.h index 30a6a50..15a8b1e 100644 --- a/psm3/hal_verbs/verbs_hal_inline_i.h +++ b/psm3/hal_verbs/verbs_hal_inline_i.h @@ -400,14 +400,8 @@ static PSMI_HAL_INLINE void psm3_hfp_verbs_ips_proto_build_connect_message( req->verbs.qp_attr.resv = 0; req->verbs.qp_attr.target_ack_delay = 0; // TBD; - from local device req->verbs.qp_attr.resv2 = 0; -#ifdef USE_RDMA_READ - // Send our RDMA Read capabilities - req->verbs.qp_attr.responder_resources = proto->ep->verbs_ep.max_qp_rd_atom; - req->verbs.qp_attr.initiator_depth = proto->ep->verbs_ep.max_qp_init_rd_atom; -#else req->verbs.qp_attr.responder_resources = 0; req->verbs.qp_attr.initiator_depth = 0; -#endif memset(&req->verbs.qp_attr.resv3, 0, sizeof(req->verbs.qp_attr.resv3)); } else #endif // USE_RC diff --git a/psm3/include/utils_debug.h b/psm3/include/utils_debug.h index bfbdf37..2df106e 100644 --- a/psm3/include/utils_debug.h +++ 
b/psm3/include/utils_debug.h @@ -86,10 +86,6 @@ #define __HFI_MMDBG 0x200 /* low-level environment variables */ #define __HFI_ENVDBG 0x400 -#ifdef PSM_OPA -#define __HFI_EPKTDBG 0x800 /* print error packet data */ -#define __HFI_CCADBG 0x1000 /* print CCA related events */ -#endif #define __HFI_DEBUG_DEFAULT __HFI_INFO #define __HFI_DEBUG_DEFAULT_STR "0x0001" @@ -110,10 +106,6 @@ #define __HFI_PROCDBG 0x0 /* print process startup (init)/exit messages */ /* print MR, mmap/nopage stuff, not using VDBG any more */ #define __HFI_MMDBG 0x0 -#ifdef PSM_OPA -#define __HFI_EPKTDBG 0x0 /* print error packet data */ -#define __HFI_CCADBG 0x0 /* print CCA related events */ -#endif #define __HFI_DEBUG_DEFAULT __HFI_INFO #define __HFI_DEBUG_DEFAULT_STR "0x0000" @@ -209,10 +201,6 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); (lev == 0) ? __HFI_INFO : __HFI_ENVDBG,\ "env " fmt, ##__VA_ARGS__) #define _HFI_MMDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_MMDBG, fmt, ##__VA_ARGS__) -#ifdef PSM_OPA -#define _HFI_EPDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_EPKTDBG, fmt, ##__VA_ARGS__) -#define _HFI_CCADBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CCADBG, fmt, ##__VA_ARGS__) -#endif /* * Use these macros (_HFI_DBG_ON and _HFI_DBG_ALWAYS) together @@ -249,10 +237,6 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #define _HFI_MMDBG_ON unlikely(psm3_dbgmask & __HFI_MMDBG) #define _HFI_MMDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) -#ifdef PSM_OPA -#define _HFI_CCADBG_ON unlikely(psm3_dbgmask & __HFI_CCADBG) -#define _HFI_CCADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) -#endif #define _HFI_INFO_ON unlikely(psm3_dbgmask & __HFI_INFO) #define _HFI_INFO_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) @@ -280,10 +264,6 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #define _HFI_CONNDBG(fmt, ...) #define _HFI_VDBG(fmt, ...) #define _HFI_MMDBG(fmt, ...) -#ifdef PSM_OPA -#define _HFI_EPDBG(fmt, ...) -#define _HFI_CCADBG(fmt, ...) -#endif #define _HFI_DBG_ON 0 #define _HFI_DBG_ALWAYS(fmt, ...) @@ -299,10 +279,6 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #if defined(PSM_CUDA) || defined(PSM_ONEAPI) #define _HFI_PDBG_DUMP_GPU_ALWAYS(buf, len) #endif -#ifdef PSM_OPA -#define _HFI_CCADBG_ON 0 -#define _HFI_CCADBG_ALWAYS(fmt, ...) -#endif #define _HFI_INFO_ON 0 #define _HFI_INFO_ALWAYS(fmt, ...) diff --git a/psm3/include/utils_sysfs.h b/psm3/include/utils_sysfs.h index 54c774c..ec1143a 100644 --- a/psm3/include/utils_sysfs.h +++ b/psm3/include/utils_sysfs.h @@ -117,18 +117,6 @@ int psm3_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr, int64_t *valp, int base); int64_t psm3_sysfs_unit_read_node_s64(uint32_t unit); -#ifdef PSM_OPA -void psm3_hfifs_free(char *data); -/* read up to one page of malloc'ed data returning - number of bytes read or -1 */ -/* caller must use psm3_hfifs_free to free *datap */ -int psm3_hfifs_read(const char *attr, char **datap); -int psm3_hfifs_unit_read(uint32_t unit, const char *attr, char **data); - -/* these read directly into supplied buffer and take a count */ -int psm3_hfifs_rd(const char *, void *, int); -int psm3_hfifs_unit_rd(uint32_t unit, const char *, void *, int); -#endif /* Given a unit number, return an error, or the corresponding cpuset. */ /* Returns an int, so -1 indicates an error. 
*/ diff --git a/psm3/include/utils_user.h b/psm3/include/utils_user.h index 23e6bb5..6b49b75 100644 --- a/psm3/include/utils_user.h +++ b/psm3/include/utils_user.h @@ -89,7 +89,6 @@ #define HFI_TF_NFLOWS 32 -#ifndef PSM_OPA // The sender uses an RDMA Write with Immediate. The immediate data // carries the receiver's desc genc and idx from which the receiver can // locate the ips_tid_recv_desc @@ -109,7 +108,6 @@ // source of the immediate callback #define RDMA_IMMED_USER_RC 0 // from a user space RC QP #define RDMA_IMMED_RV 1 // from RV module kernel QP -#endif /* IB - LRH header consts */ #define HFI_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ @@ -124,18 +122,8 @@ #define HFI_BTH_OPCODE_SHIFT 24 #define HFI_BTH_OPCODE_MASK 0xff // bth[1] -#ifdef PSM_OPA -#define HFI_BTH_BECN_SHIFT 30 -#define HFI_BTH_FECN_SHIFT 31 -#define HFI_BTH_QP_SHIFT 16 -#define HFI_BTH_QP_MASK 0xff -#endif #define HFI_BTH_FLOWID_SHIFT 11 #define HFI_BTH_FLOWID_MASK 0x1f -#ifdef PSM_OPA -#define HFI_BTH_SUBCTXT_SHIFT 8 -#define HFI_BTH_SUBCTXT_MASK 0x7 -#endif // bth[2] #define HFI_BTH_SEQ_SHIFT 0 #define HFI_BTH_SEQ_MASK 0x7ff // tidflow sequence number @@ -147,12 +135,6 @@ /* KDETH header consts */ #define HFI_KHDR_OFFSET_MASK 0x7fff #define HFI_KHDR_OM_SHIFT 15 -#ifdef PSM_OPA -#define HFI_KHDR_TID_SHIFT 16 -#define HFI_KHDR_TID_MASK 0x3ff -#define HFI_KHDR_TIDCTRL_SHIFT 26 -#define HFI_KHDR_TIDCTRL_MASK 0x3 -#endif #define HFI_KHDR_INTR_SHIFT 28 #define HFI_KHDR_SH_SHIFT 29 #define HFI_KHDR_KVER_SHIFT 30 @@ -162,11 +144,6 @@ #define HFI_KHDR_TINYLEN_MASK 0xf #define HFI_KHDR_TINYLEN_SHIFT 16 -#ifdef PSM_OPA -#define GET_HFI_KHDR_TIDCTRL(val) \ - (((val) >> HFI_KHDR_TIDCTRL_SHIFT) & \ - HFI_KHDR_TIDCTRL_MASK) -#endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) extern int is_driver_gpudirect_enabled; @@ -195,12 +172,8 @@ struct hfi_kdeth { #define HFI_CRC_SIZE_IN_BYTES 4 #endif -#ifndef PSM_OPA //#define HFI_DEFAULT_SERVICE_ID 0 /* let rv module decide */ #define HFI_DEFAULT_SERVICE_ID 0x1000125500000001ULL -#else -#define HFI_DEFAULT_SERVICE_ID 0x1000117500000000ULL -#endif #if 0 #define HFI_PERMISSIVE_LID 0xFFFF @@ -214,16 +187,6 @@ struct hfi_kdeth { #define HFI_MULTICAST_QPN 0xFFFFFF #endif -#ifdef PSM_OPA -/* Receive Header Queue: receive type (from hfi) */ -#define RCVHQ_RCV_TYPE_EXPECTED 0 -#define RCVHQ_RCV_TYPE_EAGER 1 -#define RCVHQ_RCV_TYPE_NON_KD 2 -#define RCVHQ_RCV_TYPE_ERROR 3 - -/* OPA PSM assumes that the message header is always 56 bytes. 
*/ -#define HFI_MESSAGE_HDR_SIZE 56 -#endif /* interval timing routines */ /* Convert a count of cycles to elapsed nanoseconds */ @@ -275,25 +238,6 @@ void psm3_qwordcpy_safe(volatile uint64_t *dest, const uint64_t *src, #define psm3_qwordcpy_safe psm3_qwordcpy #endif -#ifdef PSM_OPA -/* 64B move instruction support */ -#define AVX512F_BIT 16 /* level 07h, ebx */ -/* 32B move instruction support */ -#define AVX2_BIT 5 /* level 07h, ebx */ -/* 16B move instruction support */ -#define SSE2_BIT 26 /* level 01h, edx */ - -#ifdef PSM_AVX512 -void psm3_pio_blockcpy_512(volatile uint64_t *dest, - const uint64_t *src, uint32_t nblock); -#endif -void psm3_pio_blockcpy_256(volatile uint64_t *dest, - const uint64_t *src, uint32_t nblock); -void psm3_pio_blockcpy_128(volatile uint64_t *dest, - const uint64_t *src, uint32_t nblock); -void psm3_pio_blockcpy_64(volatile uint64_t *dest, - const uint64_t *src, uint32_t nblock); -#endif /* PSM_OPA */ extern uint32_t psm3_pico_per_cycle; /* only for use in these functions */ diff --git a/psm3/psm.c b/psm3/psm.c index 9c6fef0..826112c 100644 --- a/psm3/psm.c +++ b/psm3/psm.c @@ -64,12 +64,10 @@ static int psm3_verno_minor = PSM2_VERNO_MINOR; static int psm3_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); static int psm3_verno_client_val; uint8_t psm3_addr_fmt; // PSM3_ADDR_FMT -#ifndef PSM_OPA int psm3_allow_routers; // PSM3_ALLOW_ROUTERS char *psm3_allow_subnets[PSMI_MAX_SUBNETS]; // PSM3_SUBNETS int psm3_num_allow_subnets; -#endif unsigned int psm3_addr_per_nic = 1; const char *psm3_nic_wildcard = NULL; @@ -583,10 +581,6 @@ int psmi_cuda_initialize() (union psmi_envvar_val)CUDA_THRESH_RNDV, &env_cuda_thresh_rndv); cuda_thresh_rndv = env_cuda_thresh_rndv.e_int; -#ifdef PSM_OPA - if (cuda_thresh_rndv > CUDA_THRESH_RNDV) - cuda_thresh_rndv = CUDA_THRESH_RNDV; -#endif union psmi_envvar_val env_gdr_copy_limit_send; psm3_getenv("PSM3_GDRCOPY_LIMIT_SEND", @@ -711,7 +705,6 @@ int psmi_oneapi_ze_initialize() } #endif // PSM_ONEAPI -#ifndef PSM_OPA /* parse PSM3_SUBNETS to get a list of subnets we'll consider */ static psm2_error_t @@ -765,7 +758,6 @@ psmi_parse_subnets(const char *subnets) return err; } -#endif static void psmi_parse_nic_var() @@ -961,7 +953,6 @@ psm2_error_t psm3_init(int *major, int *minor) } psm3_addr_fmt = env_addr_fmt.e_int; } -#ifndef PSM_OPA { union psmi_envvar_val env_addr_per_nic; psm3_getenv("PSM3_ADDR_PER_NIC", @@ -996,7 +987,6 @@ psm2_error_t psm3_init(int *major, int *minor) if ((err = psmi_parse_subnets(env_subnets.e_str))) goto fail_unref; } -#endif psmi_parse_nic_var(); diff --git a/psm3/psm2_hal.c b/psm3/psm2_hal.c index b3f5056..767de97 100644 --- a/psm3/psm2_hal.c +++ b/psm3/psm2_hal.c @@ -154,39 +154,13 @@ void psm3_hal_register_instance(psmi_hal_instance_t *psm_hi) REJECT_IMPROPER_HI(hfp_gdr_convert_gpu_to_host_addr); #endif /* PSM_CUDA || PSM_ONEAPI */ REJECT_IMPROPER_HI(hfp_get_port_index2pkey); -#ifdef PSM_OPA - REJECT_IMPROPER_HI(hfp_set_pkey); -#endif REJECT_IMPROPER_HI(hfp_poll_type); -#ifdef PSM_OPA - REJECT_IMPROPER_HI(hfp_free_tid); - REJECT_IMPROPER_HI(hfp_get_tidcache_invalidation); - REJECT_IMPROPER_HI(hfp_update_tid); - REJECT_IMPROPER_HI(hfp_tidflow_check_update_pkt_seq); - REJECT_IMPROPER_HI(hfp_tidflow_get); - REJECT_IMPROPER_HI(hfp_tidflow_get_hw); - REJECT_IMPROPER_HI(hfp_tidflow_get_seqnum); - REJECT_IMPROPER_HI(hfp_tidflow_reset); - REJECT_IMPROPER_HI(hfp_tidflow_set_entry); - REJECT_IMPROPER_HI(hfp_get_hfi_event_bits); -#endif REJECT_IMPROPER_HI(hfp_spio_transfer_frame); 
REJECT_IMPROPER_HI(hfp_transfer_frame); -#ifdef PSM_OPA - REJECT_IMPROPER_HI(hfp_dma_send_pending_scbs); -#endif REJECT_IMPROPER_HI(hfp_drain_sdma_completions); REJECT_IMPROPER_HI(hfp_get_node_id); -#ifdef PSM_OPA - REJECT_IMPROPER_HI(hfp_get_jkey); - REJECT_IMPROPER_HI(hfp_get_pio_size); - REJECT_IMPROPER_HI(hfp_get_pio_stall_cnt); - REJECT_IMPROPER_HI(hfp_get_subctxt); - REJECT_IMPROPER_HI(hfp_get_subctxt_cnt); - REJECT_IMPROPER_HI(hfp_get_tid_exp_cnt); -#endif #endif /* PSMI_HAL_INST_CNT > 1 || defined(PSM_DEBUG) */ @@ -616,10 +590,6 @@ static struct _psmi_hal_instance *psm3_hal_get_pi_inst(void) PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"any", &env_hal); -#ifdef PSM_OPA - /* The hfp_get_num_units() call below, will not wait for the HFI driver - to come up and create device nodes in /dev/.) */ -#endif for (i=0; i <= PSM_HAL_INDEX_MAX; i++) { p = psm3_hal_table[i]; diff --git a/psm3/psm2_hal.h b/psm3/psm2_hal.h index 2e843c6..02ff881 100644 --- a/psm3/psm2_hal.h +++ b/psm3/psm2_hal.h @@ -59,9 +59,6 @@ /* Forward declaration of PSM structs: */ struct psm2_mq; -#ifdef PSM_OPA -struct ips_tid_session_list_tag; -#endif struct ips_recvhdrq_event; struct ips_scb_pendlist; struct ips_flow; @@ -86,9 +83,6 @@ struct psm3_ep_open_opts; */ typedef enum { -#ifdef PSM_OPA - PSM_HAL_INDEX_OPA = 0, -#endif PSM_HAL_INDEX_VERBS = 1, PSM_HAL_INDEX_SOCKETS = 2, PSM_HAL_INDEX_LOOPBACK = 3, @@ -101,9 +95,6 @@ typedef enum static inline const char* psm3_hal_index_to_str(int index) { switch (index) { -#ifdef PSM_OPA - case PSM_HAL_INDEX_OPA: return "opa"; -#endif case PSM_HAL_INDEX_VERBS: return "verbs"; case PSM_HAL_INDEX_SOCKETS: return "sockets"; case PSM_HAL_INDEX_LOOPBACK: return "loopback"; @@ -140,28 +131,7 @@ typedef enum PSM_HAL_ERROR_RESERVED_BY_HAL_API = 1000, } psmi_hal_errors; -#ifdef PSM_OPA -typedef enum -{ - PSM_HAL_HW_STATUS_INITTED = (1UL << 0), - PSM_HAL_HW_STATUS_CHIP_PRESENT = (1UL << 1), - PSM_HAL_HW_STATUS_IB_READY = (1UL << 2), - PSM_HAL_HW_STATUS_IB_CONF = (1UL << 3), - PSM_HAL_HW_STATUS_HWERROR = (1UL << 4) -} psmi_hal_hw_status; -#endif -#ifdef PSM_OPA -typedef enum -{ - PSM_HAL_HFI_EVENT_FROZEN = (1UL << 0), - PSM_HAL_HFI_EVENT_LINKDOWN = (1UL << 1), - PSM_HAL_HFI_EVENT_LID_CHANGE = (1UL << 2), - PSM_HAL_HFI_EVENT_LMC_CHANGE = (1UL << 3), - PSM_HAL_HFI_EVENT_SL2VL_CHANGE = (1UL << 4), - PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY = (1UL << 5) -} psmi_hal_hfi_events; -#endif /* The following enum constants correspond to the bits in the * cap_mask member of the psmi_hal_params_t. @@ -177,47 +147,6 @@ typedef enum */ typedef enum { -#ifdef PSM_OPA - PSM_HAL_CAP_SDMA = (1UL << 0), - PSM_HAL_CAP_SDMA_AHG = (1UL << 1), - PSM_HAL_CAP_EXTENDED_PSN = (1UL << 2), - PSM_HAL_CAP_HDRSUPP = (1UL << 3), - PSM_HAL_CAP_USE_SDMA_HEAD = (1UL << 4), - PSM_HAL_CAP_MULTI_PKT_EGR = (1UL << 5), - PSM_HAL_CAP_NODROP_RHQ_FULL = (1UL << 6), - PSM_HAL_CAP_NODROP_EGR_FULL = (1UL << 7), - PSM_HAL_CAP_TID_UNMAP = (1UL << 8), - PSM_HAL_CAP_PRINT_UNIMPL = (1UL << 9), - PSM_HAL_CAP_ALLOW_PERM_JKEY = (1UL << 10), - PSM_HAL_CAP_NO_INTEGRITY = (1UL << 11), - PSM_HAL_CAP_PKEY_CHECK = (1UL << 12), - PSM_HAL_CAP_STATIC_RATE_CTRL = (1UL << 13), - PSM_HAL_CAP_SDMA_HEAD_CHECK = (1UL << 14), - PSM_HAL_CAP_EARLY_CREDIT_RETURN = (1UL << 15), - /* are any GPUDIRECT features (Copy, Send DMA or RDMA) - * currently available for the given HAL. Otherwise - * PSM3_GPUDIRECT=1 is disallowed (fatal). - * Only true if HFI driver also enabled for GPU. 
- * At a minimum when this is set, GPUDirect Copy must be allowed - */ - PSM_HAL_CAP_GPUDIRECT = (1UL << 16), - PSM_HAL_CAP_DMA_HSUPP_FOR_32B_MSGS = (1UL << 17), - PSM_HAL_CAP_RSM_FECN_SUPP = (1UL << 18), - PSM_HAL_CAP_MERGED_TID_CTRLS = (1UL << 19), - /* can segmentation offload (OPA Send DMA) - * handle a non 32b mult total payload length and properly - * send a odd lengthed packet in the packet sequence. - */ - PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE = (1UL << 20), - /* Is GPUDIRECT RDMA (send and recv) currently available for - * the given HAL. Otherwise we ignore - * PSM3_GPUDIRECT_RDMA_SEND_LIMIT - * and PSM3_GPUDIRECT_RDMA_RECV_LIMIT. - * Upper level will only attempt to use GPUDIRECT RDMA if both - * this and PSM_HAL_CAP_GPUDIRECT are true. - */ - PSM_HAL_CAP_GPUDIRECT_RDMA = (1UL << 21), -#else /* PSM_OPA */ /* can spio_transfer_frame handle a non 32b multiple * payload length for both single packets and PIO * segmentation (UDP GSO) @@ -261,7 +190,6 @@ typedef enum */ PSM_HAL_CAP_GPUDIRECT_RDMA = (1UL << 6), -#endif /* PSM_OPA */ } psmi_hal_capability_bits; /* The following enum constants correspond to the bits in the @@ -273,10 +201,6 @@ typedef enum /* Rx thread is started. */ PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED = (1UL << 1), PSM_HAL_PSMI_RUNTIME_INTR_ENABLED = (1UL << 2), -#ifdef PSM_OPA - /* Header suppression is enabled: */ - PSM_HAL_HDRSUPP_ENABLED = (1UL << 3), -#endif PSM_HAL_PARAMS_VALID_NUM_UNITS = (1UL << 4), PSM_HAL_PARAMS_VALID_NUM_PORTS = (1UL << 5), PSM_HAL_PARAMS_VALID_DEFAULT_PKEY = (1UL << 6), @@ -319,84 +243,11 @@ typedef struct _psmi_hal_params char **unit_driver; } psmi_hal_params_t; -#ifdef PSM_OPA -#define PSM_HAL_MAX_SHARED_CTXTS 8 -#endif // PSM_OPA #define PSM_HAL_ALG_ACROSS 0 #define PSM_HAL_ALG_WITHIN 1 #define PSM_HAL_ALG_ACROSS_ALL 2 -#ifdef PSM_OPA -typedef enum -{ - PSM_HAL_EXP = 0, - PSM_HAL_EGR = 1, -} psmi_hal_set_sdma_req_type; - -#define PSM_HAL_SDMA_REQ_VERSION_MASK 0xF -#define PSM_HAL_SDMA_REQ_VERSION_SHIFT 0x0 -#define PSM_HAL_SDMA_REQ_OPCODE_MASK 0xF -#define PSM_HAL_SDMA_REQ_OPCODE_SHIFT 0x4 -#define PSM_HAL_SDMA_REQ_IOVCNT_MASK 0xFF -#define PSM_HAL_SDMA_REQ_IOVCNT_SHIFT 0x8 - -#ifdef PSM_CUDA -#define PSM_HAL_BUF_GPU_MEM 1 -#endif - -struct psm_hal_sdma_req_info { - /* - * bits 0-3 - version (currently used only for GPU direct) - * 1 - user space is NOT using flags field - * 2 - user space is using flags field - * bits 4-7 - opcode (enum sdma_req_opcode) - * bits 8-15 - io vector count - */ - __u16 ctrl; - /* - * Number of fragments contained in this request. - * User-space has already computed how many - * fragment-sized packet the user buffer will be - * split into. - */ - __u16 npkts; - /* - * Size of each fragment the user buffer will be - * split into. - */ - __u16 fragsize; - /* - * Index of the slot in the SDMA completion ring - * this request should be using. User-space is - * in charge of managing its own ring. - */ - __u16 comp_idx; -#ifdef PSM_CUDA - /* - * Buffer flags for this request. See HFI1_BUF_* - */ - __u16 flags; - /* The extra bytes for the PSM_CUDA version of the sdma req info - * struct is the size of the flags member. 
*/ -#define PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA sizeof(__u16) -#endif -} __attribute__((packed)); - - -typedef enum { - PSM_HAL_SDMA_RING_AVAILABLE = 0, - PSM_HAL_SDMA_RING_QUEUED = 1, - PSM_HAL_SDMA_RING_COMPLETE = 2, - PSM_HAL_SDMA_RING_ERROR = 3, -} psmi_hal_sdma_ring_slot_status; - -struct psm_hal_pbc { - __le32 pbc0; - __le16 PbcStaticRateControlCnt; - __le16 fill1; -}; -#endif // PSM_OPA typedef enum { PSMI_HAL_POLL_TYPE_URGENT = 1 @@ -533,9 +384,7 @@ struct _psmi_hal_instance void (*hfp_ips_ipsaddr_init_addressing)(struct ips_proto *proto, psm2_epid_t epid, ips_epaddr_t *ipsaddr, uint16_t *lidp -#ifndef PSM_OPA , psmi_gid128_t *gidp -#endif ); psm2_error_t (*hfp_ips_ipsaddr_init_connections)( struct ips_proto *proto, @@ -563,58 +412,8 @@ struct _psmi_hal_instance * corresponding pkey for the index as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ int (*hfp_get_port_index2pkey)(psm2_ep_t ep, int index); -#ifdef PSM_OPA - int (*hfp_set_pkey)(psmi_hal_hw_context, uint16_t); -#endif // PSM_OPA int (*hfp_poll_type)(uint16_t poll_type, psm2_ep_t ep); -#ifdef PSM_OPA - int (*hfp_free_tid)(psmi_hal_hw_context, uint64_t tidlist, uint32_t tidcnt); - int (*hfp_get_tidcache_invalidation)(psmi_hal_hw_context, uint64_t tidlist, uint32_t *tidcnt); - int (*hfp_update_tid)(psmi_hal_hw_context, uint64_t vaddr, uint32_t *length, - uint64_t tidlist, uint32_t *tidcnt, - uint16_t flags); - /* Start of tid flow functions. */ - int (*hfp_tidflow_check_update_pkt_seq)(void *vpprotoexp - /* actually a: - struct ips_protoexp *protoexp */, - psmi_seqnum_t sequence_num, - void *vptidrecvc - /* actually a: - struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr, - void (*ips_protoexp_do_tf_generr) - (void *vpprotoexp - /* actually a: - struct ips_protoexp *protoexp */, - void *vptidrecvc - /* actually a: - struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr), - void (*ips_protoexp_do_tf_seqerr) - (void *vpprotoexp - /* actually a: - struct ips_protoexp *protoexp */, - void *vptidrecvc - /* actually a: - struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr) - ); - int (*hfp_tidflow_get)(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); - - /* hfp_tidflow_get_hw is identical to hfp_tidflow_get(), but guarantees to get - its information fron h/w, and not from cached values, but may be significantly - slower than hfp_tidflow_get(), so should be used for debug only. */ - int (*hfp_tidflow_get_hw)(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); - int (*hfp_tidflow_get_seqnum)(uint64_t val, uint32_t *pseqn); - int (*hfp_tidflow_reset)(psmi_hal_hw_context, uint32_t flowid, uint32_t genval, - uint32_t seqnum); - int (*hfp_tidflow_set_entry)(uint32_t flowid, uint32_t genval, - uint32_t seqnum, psmi_hal_hw_context); - /* End of tid flow functions. 
*/ - - int (*hfp_get_hfi_event_bits) (uint64_t *event_bits, psmi_hal_hw_context); -#endif /* PSM_OPA */ psm2_error_t (*hfp_spio_transfer_frame)(struct ips_proto *proto, struct ips_flow *flow, struct ips_scb *scb, @@ -634,22 +433,9 @@ struct _psmi_hal_instance , uint32_t is_gpu_payload #endif ); -#ifdef PSM_OPA - psm2_error_t (*hfp_dma_send_pending_scbs)(struct ips_proto *proto, - struct ips_flow *flow, struct ips_scb_pendlist *slist, - int *num_sent); -#endif psm2_error_t (*hfp_drain_sdma_completions)(struct ips_proto *proto); int (*hfp_get_node_id)(int unit, int *nodep); -#ifdef PSM_OPA - int (*hfp_get_jkey)(psm2_ep_t); - int (*hfp_get_pio_size)(psmi_hal_hw_context); - int (*hfp_get_pio_stall_cnt)(psmi_hal_hw_context, uint64_t **); - int (*hfp_get_subctxt)(psmi_hal_hw_context); - int (*hfp_get_subctxt_cnt)(psmi_hal_hw_context); - int (*hfp_get_tid_exp_cnt)(psmi_hal_hw_context); -#endif /* PSM_OPA */ #endif /* PSMI_HAL_INST_CNT > 1 || defined(PSM_DEBUG) */ }; @@ -781,41 +567,14 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) #endif /* PSM_CUDA || PSM_ONEAPI */ #define psmi_hal_get_port_index2pkey(...) PSMI_HAL_DISPATCH(get_port_index2pkey,__VA_ARGS__) -#ifdef PSM_OPA -#define psmi_hal_set_pkey(...) PSMI_HAL_DISPATCH(set_pkey,__VA_ARGS__) -#endif // PSM_OPA #define psmi_hal_poll_type(...) PSMI_HAL_DISPATCH(poll_type,__VA_ARGS__) -#ifdef PSM_OPA -#define psmi_hal_free_tid(...) PSMI_HAL_DISPATCH(free_tid,__VA_ARGS__) -#define psmi_hal_get_tidcache_invalidation(...) PSMI_HAL_DISPATCH(get_tidcache_invalidation,__VA_ARGS__) -#define psmi_hal_update_tid(...) PSMI_HAL_DISPATCH(update_tid,__VA_ARGS__) -#define psmi_hal_tidflow_check_update_pkt_seq(...) PSMI_HAL_DISPATCH(tidflow_check_update_pkt_seq,__VA_ARGS__) -#define psmi_hal_tidflow_get(...) PSMI_HAL_DISPATCH(tidflow_get,__VA_ARGS__) -#define psmi_hal_tidflow_get_hw(...) PSMI_HAL_DISPATCH(tidflow_get_hw,__VA_ARGS__) -#define psmi_hal_tidflow_get_seqnum(...) PSMI_HAL_DISPATCH(tidflow_get_seqnum,__VA_ARGS__) -#define psmi_hal_tidflow_reset(...) PSMI_HAL_DISPATCH(tidflow_reset,__VA_ARGS__) -#define psmi_hal_tidflow_set_entry(...) PSMI_HAL_DISPATCH(tidflow_set_entry,__VA_ARGS__) -#define psmi_hal_get_hfi_event_bits(...) PSMI_HAL_DISPATCH(get_hfi_event_bits,__VA_ARGS__) -#endif // PSM_OPA #define psmi_hal_spio_transfer_frame(...) PSMI_HAL_DISPATCH(spio_transfer_frame,__VA_ARGS__) #define psmi_hal_transfer_frame(...) PSMI_HAL_DISPATCH(transfer_frame,__VA_ARGS__) -#ifdef PSM_OPA -#define psmi_hal_dma_send_pending_scbs(...) PSMI_HAL_DISPATCH(dma_send_pending_scbs,__VA_ARGS__) -#endif #define psmi_hal_drain_sdma_completions(...) PSMI_HAL_DISPATCH(drain_sdma_completions,__VA_ARGS__) #define psmi_hal_get_node_id(...) PSMI_HAL_DISPATCH(get_node_id,__VA_ARGS__) -#ifdef PSM_OPA -#define psmi_hal_get_jkey(...) PSMI_HAL_DISPATCH(get_jkey,__VA_ARGS__) -#define psmi_hal_get_pio_size(...) PSMI_HAL_DISPATCH(get_pio_size,__VA_ARGS__) -#define psmi_hal_get_pio_stall_cnt(...) PSMI_HAL_DISPATCH(get_pio_stall_cnt,__VA_ARGS__) -#define psmi_hal_get_subctxt(...) PSMI_HAL_DISPATCH(get_subctxt,__VA_ARGS__) -#define psmi_hal_get_subctxt_cnt(...) PSMI_HAL_DISPATCH(get_subctxt_cnt,__VA_ARGS__) -#define psmi_hal_get_tid_exp_cnt(...) 
PSMI_HAL_DISPATCH(get_tid_exp_cnt,__VA_ARGS__) - -#endif // PSM_OPA #define psmi_hal_get_hal_instance_index() psm3_hal_current_hal_instance->hal_index #define psmi_hal_get_hal_instance_name() psm3_hal_index_to_str(psm3_hal_current_hal_instance->hal_index) diff --git a/psm3/psm2_hal_inline_t.h b/psm3/psm2_hal_inline_t.h index e0edce0..960b3dd 100644 --- a/psm3/psm2_hal_inline_t.h +++ b/psm3/psm2_hal_inline_t.h @@ -100,9 +100,7 @@ static PSMI_HAL_INLINE void PSMI_HAL_CAT_INL_SYM(ips_proto_build_connect_message static PSMI_HAL_INLINE void PSMI_HAL_CAT_INL_SYM(ips_ipsaddr_init_addressing) (struct ips_proto *proto, psm2_epid_t epid, ips_epaddr_t *ipsaddr, uint16_t *lidp -#ifndef PSM_OPA , psmi_gid128_t *gidp -#endif ); static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(ips_ipsaddr_init_connections) @@ -136,61 +134,8 @@ static PSMI_HAL_INLINE void* PSMI_HAL_CAT_INL_SYM(gdr_convert_gpu_to_host_addr) #endif /* PSM_CUDA || PSM_ONEAPI */ static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_index2pkey) (psm2_ep_t ep, int index); -#ifdef PSM_OPA -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_pkey) - (psmi_hal_hw_context, uint16_t); -#endif static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(poll_type) (uint16_t, psm2_ep_t ep); -#ifdef PSM_OPA -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(free_tid) - (psmi_hal_hw_context, uint64_t tidlist, uint32_t tidcnt); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_tidcache_invalidation) - (psmi_hal_hw_context, uint64_t tidlist, uint32_t *tidcnt); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(update_tid) - (psmi_hal_hw_context, uint64_t vaddr, uint32_t *length, - uint64_t tidlist, uint32_t *tidcnt, - uint16_t flags); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_check_update_pkt_seq) - (void *vpprotoexp - /* actually a: - struct ips_protoexp *protoexp */, - psmi_seqnum_t sequence_num, - void *vptidrecvc - /* actually a: - struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr, - void (*ips_protoexp_do_tf_generr) - (void *vpprotoexp - /* actually a: - struct ips_protoexp *protoexp */, - void *vptidrecvc - /* actually a: - struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr), - void (*ips_protoexp_do_tf_seqerr) - (void *vpprotoexp - /* actually a: - struct ips_protoexp *protoexp */, - void *vptidrecvc - /* actually a: - struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr)); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get) - (uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_hw) - (uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_seqnum) - (uint64_t val, uint32_t *pseqn); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_reset) - (psmi_hal_hw_context, uint32_t flowid, uint32_t genval, - uint32_t seqnum); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_set_entry) - (uint32_t flowid, uint32_t genval, uint32_t seqnum, - psmi_hal_hw_context); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_hfi_event_bits) - (uint64_t *event_bits, psmi_hal_hw_context); -#endif /* PSM_OPA */ static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(spio_transfer_frame) (struct ips_proto *proto, @@ -212,31 +157,10 @@ static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(transfer_frame) , uint32_t is_gpu_payload #endif ); -#ifdef PSM_OPA -static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(dma_send_pending_scbs) - (struct 
ips_proto *proto, - struct ips_flow *flow, struct ips_scb_pendlist *slist, - int *num_sent); -#endif static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(drain_sdma_completions) (struct ips_proto *proto); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_node_id) (int unit, int *nodep); -#ifdef PSM_OPA -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_jkey) - (psm2_ep_t ep); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_pio_size) - (psmi_hal_hw_context ctxt); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_pio_stall_cnt) - (psmi_hal_hw_context, - uint64_t **); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_subctxt) - (psmi_hal_hw_context ctxt); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_subctxt_cnt) - (psmi_hal_hw_context ctxt); -static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_tid_exp_cnt) - (psmi_hal_hw_context ctxt); -#endif #endif /* _PSM2_HAL_INLINE_T_H_ */ diff --git a/psm3/psm_config.h b/psm3/psm_config.h index f4cc9be..eeedfe4 100644 --- a/psm3/psm_config.h +++ b/psm3/psm_config.h @@ -144,11 +144,7 @@ /* XXX TODO: Getting the gpu page size from driver at init time */ #define PSMI_GPU_PAGESIZE 65536 -#ifdef PSM_OPA -#define GDR_COPY_LIMIT_SEND 32 -#else #define GDR_COPY_LIMIT_SEND 128 -#endif #define GDR_COPY_LIMIT_RECV 64000 #elif defined(PSM_ONEAPI) @@ -165,11 +161,7 @@ /* All GPU transfers beyond this threshold use * RNDV protocol. It is mostly a send side knob. */ -#ifdef PSM_OPA -#define CUDA_THRESH_RNDV 32768 -#else #define CUDA_THRESH_RNDV 8000 -#endif #define GPUDIRECT_THRESH_RV 3 diff --git a/psm3/psm_context.c b/psm3/psm_context.c index 306431c..5b17f41 100644 --- a/psm3/psm_context.c +++ b/psm3/psm_context.c @@ -91,38 +91,6 @@ int psm3_context_interrupt_isenabled(psm2_ep_t ep) return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); } -#ifdef PSM_OPA -/* Returns 1 when all of the active units have their free contexts - * equal the number of contexts. This is an indication that no - * jobs are currently running. - * - * Note that this code is clearly racy (this code may happen concurrently - * by two or more processes, and this point of observation, - * occurs earlier in time to when the decision is made for deciding which - * context to assign, which will also occurs earlier in time to when the - * context is actually assigned. And, when the context is finally - * assigned, this will change the "nfreectxts" observed below.) - */ -static int psmi_all_active_units_have_max_freecontexts(int nunits) -{ - int u; - - for (u=0;u < nunits;u++) - { - if (psmi_hal_get_unit_active(u) > 0) - { - int nfreectxts=psmi_hal_get_num_free_contexts(u), - nctxts=psmi_hal_get_num_contexts(u); - if (nfreectxts > 0 && nctxts > 0) - { - if (nfreectxts != nctxts) - return 0; - } - } - } - return 1; -} -#endif /* returns the 8-bit hash value of an uuid. */ static inline @@ -188,21 +156,7 @@ static void psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start, long *unit_end, int nunits) { -#ifdef PSM_OPA - /* if the number of ranks on the host is 1 and ... */ - if ((psm3_get_mylocalrank_count() == 1) && - /* - * All of the active units have free contexts equal the - * number of contexts. 
- */ - psmi_all_active_units_have_max_freecontexts(nunits)) { - /* we start looking at unit 0, and end at nunits-1: */ - *unit_start = 0; - *unit_end = nunits - 1; - } else { -#else { -#endif int found, saved_hfis[nunits]; /* else, we are going to look at: @@ -432,7 +386,6 @@ psmi_compute_start_and_end_unit(long unit_param, long addr_index, /* if the user did not set PSM3_NIC then ... */ if (unit_param == PSM3_NIC_ANY) { -#ifndef PSM_OPA if (nunitsactive > 1) { // if NICs are on different planes (non-routed subnets) // we need to have all ranks default to the same plane @@ -463,7 +416,6 @@ psmi_compute_start_and_end_unit(long unit_param, long addr_index, } } } -#endif /* Get the actual selection algorithm from the environment: */ nic_sel_alg = psmi_parse_nic_selection_algorithm(); @@ -822,16 +774,10 @@ psm3_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey, uint16_t* oind err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Can't get a valid pkey value from pkey table on %s port %u\n", ep->dev_name, ep->portnum); return err; -#ifdef PSM_OPA // allow 0x7fff and 0xffff - } else if ((ret & 0x7fff) == 0x7fff) { - continue; /* management pkey, not for app traffic. */ -#endif } -#ifndef PSM_OPA // pkey == 0 means just get slot 0 if (! pkey && ! i) break; -#endif if ((pkey & 0x7fff) == (uint16_t)(ret & 0x7fff)) { break; } diff --git a/psm3/psm_context.h b/psm3/psm_context.h index 2fd955a..5210fe8 100644 --- a/psm3/psm_context.h +++ b/psm3/psm_context.h @@ -60,32 +60,6 @@ #ifndef _PSM_CONTEXT_H #define _PSM_CONTEXT_H -#ifdef PSM_OPA -typedef -struct psmi_context { -#ifdef PSM_OPA - /* The following three member variables are used for sharing contexts among - subcontexts and they have the following common properties: - - a. They are all initialized below HAL layer when the context is opened. - b. If they are NULL that means no context is being shared among subcontexts, - non-NULL means a context is being shared among some number of subcontexts. - c. The initialization code is currently found in the gen1 hal instance. - */ - void *spio_ctrl; - void *tid_ctrl; - void *tf_ctrl; /* ips_tf_ctrl in shared memory */ - /* end of shared context member variables. 
*/ -#endif - - psmi_hal_hw_context psm_hw_ctxt; - - psm2_ep_t ep; /* psm ep handle */ - psm2_epid_t epid; /* psm integral ep id */ - psm2_error_t status_lasterr; - time_t networkLostTime; -} psmi_context_t; -#endif psm2_error_t psm3_context_open(const psm2_ep_t ep, long unit_id, long port, long addr_index, diff --git a/psm3/psm_ep.c b/psm3/psm_ep.c index 8479209..92f45d7 100644 --- a/psm3/psm_ep.c +++ b/psm3/psm_ep.c @@ -703,10 +703,6 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, opts.outsl = opts_i->outsl; if (opts_i->service_id) opts.service_id = (uint64_t) opts_i->service_id; -#ifdef PSM_OPA - if (opts_i->path_res_type != PSM2_PATH_RES_NONE) - opts.path_res_type = opts_i->path_res_type; -#endif if (opts_i->senddesc_num) opts.senddesc_num = opts_i->senddesc_num; if (opts_i->imm_size) @@ -714,11 +710,7 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, /* Get Service ID from environment */ if (!psm3_getenv("PSM3_IB_SERVICE_ID", -#ifdef PSM_OPA - "Service ID for path resolution", -#else "Service ID for RV module RC QP connection establishment", -#endif PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_ULONG_FLAGS, // FLAGS only affects output: hex (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID, @@ -726,33 +718,7 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, opts.service_id = (uint64_t) envvar_val.e_ulonglong; } -#ifdef PSM_OPA - /* Get Path resolution type from environment Possible choices are: - * - * NONE : Default same as previous instances. Utilizes static data. - * OPP : Use OFED Plus Plus library to do path record queries. - * UMAD : Use raw libibumad interface to form and process path records. - */ - if (!psm3_getenv("PSM3_PATH_REC", - "Mechanism to query NIC path record (default is no path query)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"none", &envvar_val)) { - if (!strcasecmp(envvar_val.e_str, "none")) - opts.path_res_type = PSM2_PATH_RES_NONE; - else if (!strcasecmp(envvar_val.e_str, "opp")) - opts.path_res_type = PSM2_PATH_RES_OPP; - else if (!strcasecmp(envvar_val.e_str, "umad")) - opts.path_res_type = PSM2_PATH_RES_UMAD; - else { - _HFI_ERROR("Unknown path resolution type %s. " - "Disabling use of path record query.\n", - envvar_val.e_str); - opts.path_res_type = PSM2_PATH_RES_NONE; - } - } -#else opts.path_res_type = PSM2_PATH_RES_NONE; -#endif /* Get user specified port number to use. 
*/ if (!psm3_getenv("PSM3_NIC_PORT", "NIC Port number (0 autodetects)", @@ -765,11 +731,7 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, /* Get service level from environment, path-query overrides it */ if (!psm3_getenv ("PSM3_NIC_SL", "NIC outging ServiceLevel number (default 0)", -#ifdef PSM_OPA - PSMI_ENVVAR_LEVEL_USER, -#else PSMI_ENVVAR_LEVEL_HIDDEN, -#endif PSMI_ENVVAR_TYPE_LONG, (union psmi_envvar_val)PSMI_SL_DEFAULT, &envvar_val)) { opts.outsl = envvar_val.e_long; @@ -781,11 +743,7 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, */ if (!psm3_getenv("PSM3_PKEY", "PKey to use for endpoint (0=use slot 0)", -#ifdef PSM_OPA - PSMI_ENVVAR_LEVEL_USER, -#else PSMI_ENVVAR_LEVEL_HIDDEN, -#endif PSMI_ENVVAR_TYPE_ULONG_FLAGS, // show in hex (union psmi_envvar_val)((unsigned int)(psmi_hal_get_default_pkey())), &envvar_val)) { @@ -1225,7 +1183,6 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, int j; psmi_hal_context_initstats(ep); -#ifndef PSM_OPA union psmi_envvar_val envvar_val; if (num_rails <= 0) { @@ -1257,10 +1214,6 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, } for (j= 0; j< envvar_val.e_uint; j++) { -#else - j=0; - { -#endif for (i = 0; i < num_rails; i++) { _HFI_VDBG("rail %d unit %u port %u addr_index %d\n", i, units[i], ports[i], addr_indexes[i]); // did 0, 0 already above diff --git a/psm3/psm_ep.h b/psm3/psm_ep.h index a5a4cb3..9241510 100644 --- a/psm3/psm_ep.h +++ b/psm3/psm_ep.h @@ -60,18 +60,12 @@ #ifndef _PSMI_EP_H #define _PSMI_EP_H -#ifdef PSM_OPA -#if defined(PSM_VERBS) || defined(PSM_SOCKETS) -#error "PSM_OPA not allowed with PSM_VERBS and/or PSM_SOCKETS" -#endif -#else // PSM_OPA #if !defined(PSM_VERBS) && !defined(PSM_SOCKETS) && !defined(PSM_NONE) #error "At least one of PSM_VERBS or PSM_SOCKETS must be defined" #endif #if defined(PSM_VERBS) && defined(PSM_SOCKETS) && defined(UMR_CACHE) #error "UMR_CACHE not yet allowed with both PSM_VERBS and PSM_SOCKETS enabled" #endif -#endif // PSM_OPA #ifdef PSM_VERBS #include "hal_verbs/verbs_ep.h" @@ -88,23 +82,10 @@ /* any port num to match. */ #define PSM3_NIC_PORT_ANY ((long)0) -#ifdef PSM_OPA -#define PSMI_HFI_TYPE_UNKNOWN 0 -#define PSMI_HFI_TYPE_OPA1 1 -#define PSMI_HFI_TYPE_OPA2 2 -#endif #define PSMI_SL_DEFAULT 0 #define PSMI_SL_MIN 0 #define PSMI_SL_MAX 31 -#ifdef PSM_OPA -#define PSMI_SC_DEFAULT 0 -#define PSMI_VL_DEFAULT 0 -#define PSMI_SC_ADMIN 15 -#define PSMI_VL_ADMIN 15 -#define PSMI_SC_NBITS 5 /* Number of bits in SC */ -#define PSMI_N_SCS (1 << PSMI_SC_NBITS) /* The number of SC's */ -#endif #define PSM_MCTXT_APPEND(head, node) \ node->mctxt_prev = head->mctxt_prev; \ @@ -128,10 +109,6 @@ struct psm2_ep { #endif #ifdef PSM_SOCKETS struct psm3_sockets_ep sockets_ep; -#endif -#ifdef PSM_OPA - /* OPA specific device pointer */ - psmi_context_t context; #endif }; @@ -177,9 +154,6 @@ struct psm2_ep { * Note UDP vs TCP are separate EPID protocols */ uint8_t rdmamode; /* PSM3_RDMA */ -#ifdef PSM_OPA - /* PSM3_TID (OPA100) */ -#endif #ifdef PSM_HAVE_REG_MR /* per EP information needed to create verbs MR cache */ uint8_t mr_cache_mode; /** PSM3_MR_CACHE_MODE */ @@ -191,7 +165,6 @@ struct psm2_ep { uint32_t hfi_imm_size; /** Immediate data size */ uint32_t connections; /**> Number of connections */ -#ifndef PSM_OPA /* HAL indicates send segmentation support (OPA Send DMA or UDP GSO) * by setting max_segs>1 and max_size > 1 MTU. 
* chunk_size used will be min(chunk_max_segs*frag_size, chunk_max_size) @@ -201,7 +174,6 @@ struct psm2_ep { */ uint16_t chunk_max_segs; /* max fragments in 1 HAL send call */ uint32_t chunk_max_size; /* max payload in 1 HAL send call */ -#endif char *context_mylabel; uint32_t yield_spin_cnt; diff --git a/psm3/psm_ep_connect.c b/psm3/psm_ep_connect.c index 3d8cc56..55d698d 100644 --- a/psm3/psm_ep_connect.c +++ b/psm3/psm_ep_connect.c @@ -130,12 +130,6 @@ psm3_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid, array_of_errors[j] = PSM2_EPID_UNKNOWN; array_of_epaddr[j] = NULL; if (psm3_epid_addr_fmt(array_of_epid[j]) != ep->addr_fmt) { -#ifdef PSM_OPA - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - " Unknown version of EPID - %u\n" - "Please upgrade PSM3 or set PSM3_ADDR_FMT=1 in the environment to force EPID version 1 \n", - psm3_epid_addr_fmt(array_of_epid[j])); -#else /* PSM_OPA */ psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, " Mismatched address format: remote EP (%s): %s (%u) Local EP: %s (%u)\n" "Confirm all nodes are running the same interconnect HW, addressing format and PSM version\n", @@ -144,9 +138,7 @@ psm3_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid, psm3_epid_addr_fmt(array_of_epid[j]), psm3_epid_str_addr_fmt(ep->epid), ep->addr_fmt); -#endif } -#ifndef PSM_OPA if (psm3_epid_protocol(array_of_epid[j]) != psm3_epid_protocol(ep->epid)) { psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, " Mismatched protocol: remote EP (%s): %s (%u) Local EP: %s (%u)\n" @@ -157,7 +149,6 @@ psm3_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid, psm3_epid_str_protocol(ep->epid), psm3_epid_protocol(ep->epid)); } -#endif /* PSM_OPA */ num_toconnect++; } epid_mask_isdupof[j] = -1; diff --git a/psm3/psm_error.c b/psm3/psm_error.c index f7d6fdc..69d362f 100644 --- a/psm3/psm_error.c +++ b/psm3/psm_error.c @@ -294,11 +294,7 @@ struct psmi_error_item psmi_error_items[] = { {PSMI_NOLOG, "Endpoint was already connected"}, /* PSM2_EPID_ALREADY_CONNECTED = 48 */ {LOG_CRIT, "Two or more endpoints have the same network id (LID)"}, /* PSM2_EPID_NETWORK_ERROR = 49 */ {LOG_CRIT, "Endpoint provided incompatible Partition Key"}, -#ifdef PSM_OPA - {LOG_CRIT, "Unable to resolve network path. Is the SM running?"}, -#else {LOG_CRIT, "Unable to resolve network path. Check connectivity and routing between nodes"}, -#endif {LOG_CRIT, "Unable to establish RV RC QP connection"}, /* PSM2_EPID_RV_CONNECT_ERROR */ {LOG_INFO, "Recovering RV RC QP connection"}, /* PSM2_EPID_RV_CONNECT_RECOVERING */ {PSMI_NOLOG, "unknown 54"}, diff --git a/psm3/psm_mq.c b/psm3/psm_mq.c index fd3539b..4b9cc08 100644 --- a/psm3/psm_mq.c +++ b/psm3/psm_mq.c @@ -783,20 +783,11 @@ psm3_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) * any more than copysz. 
After that, swap system with user buffer */ req->recv_msgoff = min(req->recv_msgoff, msglen); -#ifdef PSM_OPA - psm3_mq_recv_copy(mq, req, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - req->is_buf_gpu_mem, -#endif - buf, req->req_data.send_msglen, - req->recv_msgoff); -#else psm3_mq_recv_copy(mq, req, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) req->is_buf_gpu_mem, #endif buf, len, req->recv_msgoff); -#endif psm3_mq_sysbuf_free(mq, req->req_data.buf); req->state = MQ_STATE_MATCHED; @@ -811,16 +802,11 @@ psm3_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) */ req->recv_msgoff = min(req->recv_msgoff, msglen); if (req->send_msgoff) { // only have sysbuf if RTS w/payload -#ifdef PSM_OPA - psm3_mq_mtucpy(buf, (const void *)req->req_data.buf, - req->recv_msgoff); -#else psm3_mq_recv_copy(mq, req, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) req->is_buf_gpu_mem, #endif buf, len, req->recv_msgoff); -#endif psm3_mq_sysbuf_free(mq, req->req_data.buf); } diff --git a/psm3/psm_mq_recv.c b/psm3/psm_mq_recv.c index f75e64e..a8e9e8d 100644 --- a/psm3/psm_mq_recv.c +++ b/psm3/psm_mq_recv.c @@ -257,16 +257,7 @@ psm3_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, } if (req->state == MQ_STATE_MATCHED) { -#if 0 && defined(PSM_HAVE_REG_MR) - // this is a bit paranoid, if the receiver selected LONG_DATA in CTS - // it will not have registered an MR - if (req->mr) { - _HFI_MMDBG("LONG_DATA recv complete, releasing MR: rkey: 0x%x\n", req->mr->rkey); - psm3_verbs_release_mr(req->mr); - req->mr = NULL; - ips_tid_mravail_callback(req->rts_peer->proto); - } -#elif defined(PSM_HAVE_REG_MR) +#if defined(PSM_HAVE_REG_MR) psmi_assert(! req->mr); #endif req->state = MQ_STATE_COMPLETE; @@ -848,17 +839,12 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) switch (ureq->state) { case MQ_STATE_COMPLETE: if (ureq->req_data.buf != NULL) { /* 0-byte don't alloc a sysreq_data.buf */ -#ifdef PSM_OPA - psm3_mq_mtucpy(ereq->req_data.buf, (const void *)ureq->req_data.buf, - msglen); -#else psm3_mq_recv_copy(mq, ureq, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) ereq->is_buf_gpu_mem, #endif ereq->req_data.buf, ereq->req_data.buf_len, msglen); -#endif psm3_mq_sysbuf_free(mq, ureq->req_data.buf); #if defined(PSM_CUDA) || defined(PSM_ONEAPI) } else { @@ -875,20 +861,12 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) ereq->ptl_req_ptr = ureq->ptl_req_ptr; ereq->send_msgoff = ureq->send_msgoff; ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); -#ifdef PSM_OPA - if (ereq->recv_msgoff) { - psm3_mq_mtucpy(ereq->req_data.buf, - (const void *)ureq->req_data.buf, - ereq->recv_msgoff); - } -#else psm3_mq_recv_copy(mq, ureq, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) ereq->is_buf_gpu_mem, #endif ereq->req_data.buf, ereq->req_data.buf_len, ereq->recv_msgoff); -#endif psm3_mq_sysbuf_free(mq, ureq->req_data.buf); ereq->type = ureq->type; STAILQ_INSERT_AFTER(&mq->eager_q, ureq, ereq, nextq); @@ -901,11 +879,6 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) ereq->send_msgoff = ureq->send_msgoff; ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); if (ereq->send_msgoff) { // only have sysbuf if RTS w/payload -#ifdef PSM_OPA - psm3_mq_mtucpy(ereq->req_data.buf, - (const void *)ureq->req_data.buf, - ereq->recv_msgoff); -#else psm3_mq_recv_copy(mq, ureq, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) ereq->is_buf_gpu_mem, @@ -913,7 +886,6 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) ereq->req_data.buf, ereq->req_data.buf_len, ereq->recv_msgoff); 
-#endif psm3_mq_sysbuf_free(mq, ureq->req_data.buf); } ereq->rts_callback = ureq->rts_callback; diff --git a/psm3/psm_netutils.h b/psm3/psm_netutils.h index 3aa133a..eff1115 100644 --- a/psm3/psm_netutils.h +++ b/psm3/psm_netutils.h @@ -108,7 +108,6 @@ typedef psmi_qual_netaddr128_addr_t psmi_naddr128_t; // IPv6: 128b subnet_prefix typedef psmi_qual_netaddr128_addr_t psmi_subnet128_t; -#ifndef PSM_OPA static inline psmi_bare_netaddr128_t psmi_bare_netaddr128_and( psmi_bare_netaddr128_t a, psmi_bare_netaddr128_t b) { @@ -146,7 +145,6 @@ static inline int && __be32_to_cpu(s->sin6_addr.s6_addr32[2]) == (gid.lo >> 32) && __be32_to_cpu(s->sin6_addr.s6_addr32[3]) == (gid.lo & 0xffffffff)); } -#endif /* ! PSM_OPA */ // PSM3_ADDR_FMT sets this value, default of PSMI_ADDR_FMT_DEFAULT @@ -161,18 +159,6 @@ extern uint8_t psm3_addr_fmt; // PSM3_ADDR_FMT extern unsigned int psm3_addr_per_nic; #define PSMI_ADDR_FMT_SHM 0 // shm-only or self-only -#ifdef PSM_OPA -#define PSMI_ADDR_FMT_IPATH 1 // iPath -#define PSMI_ADDR_FMT_OPA 2 // OPA - -#define PSMI_MAX_ADDR_FMT_SUPPORTED 2 -#define PSMI_MIN_ADDR_FMT_SUPPORTED 1 -#define PSMI_ADDR_FMT_DEFAULT 2 - -#define PSMI_IPS_ADDR_FMT_IS_VALID(addr_fmt) ((addr_fmt) == PSMI_ADDR_FMT_IPATH \ - || (addr_fmt) == PSMI_ADDR_FMT_OPA) - -#else #define PSMI_ADDR_FMT_IB 3 // IB/OPA UD Verbs #define PSMI_ADDR_FMT_IPV4 4 // Eth UD/UDP IPv4 //5 // unused @@ -208,13 +194,11 @@ typedef enum { PSMI_ETH_PROTO_TCP =2, // TCP PSMI_ETH_PROTO_NA =0xff // internal only when N/A } psmi_eth_proto_t; -#endif // build addresses from basic information, mostly for use in low level // routines like get_port_subnet in HAL psmi_subnet128_t psm3_build_ib_subnet128(uint64_t hi); psmi_naddr128_t psm3_build_ib_naddr128(psmi_gid128_t gid); -#ifndef PSM_OPA psmi_subnet128_t psm3_build_ipv4_subnet128(uint32_t ip_addr, uint32_t ip_netmask, uint8_t prefix_len); psmi_naddr128_t psm3_build_ipv4_naddr128(uint32_t ip_addr, uint8_t prefix_len); @@ -222,9 +206,7 @@ psmi_subnet128_t psm3_build_ipv6_subnet128(psmi_bare_netaddr128_t ipv6_addr, psmi_bare_netaddr128_t ipv6_netmask, uint8_t prefix_len); psmi_naddr128_t psm3_build_ipv6_naddr128(psmi_bare_netaddr128_t ip_addr, uint8_t prefix_len); -#endif -#ifndef PSM_OPA // PSM3_SUBNETS specifies a comma separated list of Ethernet subnets which will // be considered for Ethernet ports. Ports which do not match any of the // entries will not be considered for use by PSM3. @@ -263,7 +245,6 @@ extern int psm3_num_allow_subnets; int psm3_allow_ib_subnet(uint64_t subnet); int psm3_allow_ipv4_subnet(uint32_t subnet, uint8_t prefix_len); int psm3_allow_ipv6_subnet(psmi_bare_netaddr128_t subnet, uint8_t prefix_len); -#endif /* PSM_OPA */ // variable to store NIC name wildcard if specified (def. psm.c) extern const char *psm3_nic_wildcard; @@ -296,15 +277,10 @@ const char *psm3_ipv6_fmt(psmi_bare_netaddr128_t ipv6_addr, uint8_t prefix_len, int bufno); const char *psm3_gid128_fmt(psmi_gid128_t gid, int bufno); const char *psm3_subnet128_fmt(psmi_subnet128_t subnet, int bufno); -#ifdef PSM_OPA -void psm3_subnet128_fmt_name(psmi_subnet128_t subnet, char *buf, int buflen); -#else void psm3_subnet128_fmt_name(psmi_eth_proto_t protocol, psmi_subnet128_t subnet, char *buf, int buflen); -#endif const char *psm3_naddr128_fmt(psmi_naddr128_t addr, int bufno); -#ifndef PSM_OPA // used for IPv4 netmask processing. A valid netmask has a sequence of 1s // and then all other bits are 0. 
// This counts how many 1s are in the high end of the netmask and confirms @@ -318,7 +294,6 @@ uint8_t psm3_compute_ipv4_prefix_len(uint32_t netmask); // the remaining low bits are 0. // returns 0 if netmask is invalid int psm3_compute_ipv6_prefix_len(psmi_bare_netaddr128_t netmask); -#endif #ifdef PSM_VERBS // search ifconfig for the given IPv4 ip_addr and return it's netmask diff --git a/psm3/psm_stats.c b/psm3/psm_stats.c index 2ff626a..6b2b0a2 100644 --- a/psm3/psm_stats.c +++ b/psm3/psm_stats.c @@ -56,9 +56,6 @@ #include "psm_user.h" #include "psm_mq_internal.h" #include -#ifdef PSM_OPA -#include "hal_gen1/gen1_service.h" // for OPA specific stats -#endif struct psmi_stats_type { STAILQ_ENTRY(psmi_stats_type) next; @@ -243,13 +240,6 @@ void psm3_stats_show(uint32_t statsmask) fprintf(perf_stats_fd, " %s%s%s\n", type->heading, type->info?" ":"", type->info?type->info:""); -#ifdef PSM_OPA - if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS || - type->statstype == PSMI_STATSTYPE_DEVSTATS) { - fprintf(perf_stats_fd, " skipping device stats\n"); - continue; - } -#endif for (i=0, entry=&type->entries[0]; inum_entries; i++, entry++) { uint64_t value; value = (entry->getfn != NULL)? entry->getfn(type->context) @@ -347,9 +337,7 @@ psm3_stats_initialize(void) psm3_getenv("PSM3_PRINT_STATSMASK", "Mask of statistic types to print: " "MQ=1, RCVTHREAD=0x100, IPS=0x200" -#ifdef PSM_OPA - ", TID=0x400" -#elif defined(PSM_HAVE_REG_MR) +#if defined(PSM_HAVE_REG_MR) ", RDMA=0x400, MRCache=0x800" #endif #ifdef PSM_DEBUG @@ -426,13 +414,6 @@ static uint32_t typestring_to_type(const char *typestr) else if ((strncasecmp(typestr, "tid", 4) == 0) || (strncasecmp(typestr, "tids", 5) == 0)) return PSMI_STATSTYPE_RDMA; -#ifdef PSM_OPA - else if ((strncasecmp(typestr, "counter", 8) == 0) || - (strncasecmp(typestr, "counters", 9) == 0)) - return PSMI_STATSTYPE_DEVCOUNTERS; - else if (strncasecmp(typestr, "devstats", 9) == 0) - return PSMI_STATSTYPE_DEVSTATS; -#endif else if ((strncasecmp(typestr, "memory", 7) == 0) || (strncasecmp(typestr, "alloc", 6) == 0) || (strncasecmp(typestr, "malloc", 7) == 0)) @@ -479,54 +460,6 @@ void psmi_stats_mpspawn_callback(struct mpspawn_stats_req_args *args) psmi_assert(num == type->num_entries); -#ifdef PSM_OPA - if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS || - type->statstype == PSMI_STATSTYPE_DEVSTATS) { - int unit_id = ((psm2_ep_t) type->context)->unit_id; - int portno = ((psm2_ep_t) type->context)->portnum; - uintptr_t off; - uint8_t *p = NULL; - int nc, npc, ns; - int nstats = psm3_gen1_get_stats_names_count(); - int nctrs = psm3_gen1_get_ctrs_unit_names_count(unit_id); - int npctrs = psm3_gen1_get_ctrs_port_names_count(unit_id); - - if (nctrs != -1 && npctrs != -1) - c = psmi_calloc(PSMI_EP_NONE, STATS, nctrs + npctrs, - sizeof(uint64_t)); - if (nstats != -1) - s = psmi_calloc(PSMI_EP_NONE, STATS, nstats, - sizeof(uint64_t)); - - /* - * If hfifs is not loaded, we set NAN everywhere. 
We don't want - * stats to break just because 1 node didn't have hfi-stats - */ - if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS && c != NULL) { - nc = psm3_gen1_get_ctrs_unit(unit_id, c, nctrs); - if (nc != -1 && nc == nctrs) - p = (uint8_t *) c; - if (nc == -1) - nc = 0; - npc = - psm3_gen1_get_ctrs_port(unit_id, portno, c + nc, npctrs); - if (!p && npc > 0 && npc == npctrs) - p = (uint8_t *) c; - } else if (s != NULL) { - ns = psm3_gen1_get_stats(s, nstats); - if (ns != -1) - p = (uint8_t *) s; - } - for (i = 0; i < num; i++) { - entry = &type->entries[i]; - if (p) { - off = (uintptr_t) entry->u.off; - stats[i] = *((uint64_t *) (p + off)); - } else - stats[i] = MPSPAWN_NAN_U64; - } - } else -#endif if (type->statstype == PSMI_STATSTYPE_MEMORY) { for (i = 0; i < num; i++) { entry = &type->entries[i]; @@ -582,10 +515,6 @@ stats_register_mpspawn_single(mpspawn_stats_add_fn add_fn, return; } -#ifdef PSM_OPA -static void stats_register_hfi_counters(psm2_ep_t ep); -static void stats_register_hfi_stats(psm2_ep_t ep); -#endif static void stats_register_mem_stats(psm2_ep_t ep); static psm2_error_t psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args); @@ -615,16 +544,6 @@ void *psmi_stats_register(struct mpspawn_stats_init_args *args) if (statsmask & PSMI_STATSTYPE_MQ) psm3_mq_stats_register(args->mq, args->add_fn); -#ifdef PSM_OPA - if (psm3_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { - /* PSM and hfi level statistics */ - if (statsmask & PSMI_STATSTYPE_DEVCOUNTERS) - stats_register_hfi_counters(args->mq->ep); - - if (statsmask & PSMI_STATSTYPE_DEVSTATS) - stats_register_hfi_stats(args->mq->ep); - } -#endif if (statsmask & PSMI_STATSTYPE_MEMORY) stats_register_mem_stats(args->mq->ep); @@ -841,96 +760,7 @@ psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args) return err; } -#ifdef PSM_OPA -static -void stats_register_hfi_counters(psm2_ep_t ep) -{ - int i, nc, npc; - char *cnames = NULL, *pcnames = NULL; - struct psmi_stats_entry *entries = NULL; - - nc = psm3_gen1_get_ctrs_unit_names(ep->unit_id, &cnames); - if (nc == -1 || cnames == NULL) - goto bail; - npc = psm3_gen1_get_ctrs_port_names(ep->unit_id, &pcnames); - if (npc == -1 || pcnames == NULL) - goto bail; - entries = - psmi_calloc(ep, STATS, nc + npc, sizeof(struct psmi_stats_entry)); - if (entries == NULL) - goto bail; - - for (i = 0; i < nc; i++) { - entries[i].desc = psm3_gen1_get_next_name(&cnames); - entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | - MPSPAWN_STATS_SKIP_IF_ZERO; - entries[i].getfn = NULL; - entries[i].u.off = i * sizeof(uint64_t); - } - for (i = nc; i < nc + npc; i++) { - entries[i].desc = psm3_gen1_get_next_name(&pcnames); - entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | - MPSPAWN_STATS_SKIP_IF_ZERO; - entries[i].getfn = NULL; - entries[i].u.off = i * sizeof(uint64_t); - } - psm3_stats_register_type("OPA_device_counters", - PSMI_STATSTYPE_DEVCOUNTERS, - entries, nc + npc, ep, ep->dev_name); - // psm3_stats_register_type makes it's own copy of entries - // so we should free the entries buffer. - // The snames will be freed when we deregister the hfi. 
- psmi_free(entries); - return; - -bail: - if (cnames != NULL) - psm3_gen1_release_names(cnames); - if (pcnames != NULL) - psm3_gen1_release_names(pcnames); - if (entries != NULL) - psmi_free(entries); -} -#endif - -#ifdef PSM_OPA -static -void stats_register_hfi_stats(psm2_ep_t ep) -{ - int i, ns; - char *snames = NULL; - struct psmi_stats_entry *entries = NULL; - - ns = psm3_gen1_get_stats_names(&snames); - if (ns <= 0 || snames == NULL) - goto bail; - entries = psmi_calloc(ep, STATS, ns, sizeof(struct psmi_stats_entry)); - if (entries == NULL) - goto bail; - - for (i = 0; i < ns; i++) { - entries[i].desc = psm3_gen1_get_next_name(&snames); - entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | - MPSPAWN_STATS_SKIP_IF_ZERO; - entries[i].getfn = NULL; - entries[i].u.off = i * sizeof(uint64_t); - } - psm3_stats_register_type("OPA_device_statistics", - PSMI_STATSTYPE_DEVSTATS, entries, ns, ep, - ep->dev_name); - // psm3_stats_register_type makes it's own copy of entries - // so we should free the entries buffer. - // The snames will be freed when we deregister the hfi. - psmi_free(entries); - return; -bail: - if (snames != NULL) - psm3_gen1_release_names(snames); - if (entries != NULL) - psmi_free(entries); -} -#endif #undef _SDECL #define _SDECL(_desc, _param) { \ diff --git a/psm3/psm_stats.h b/psm3/psm_stats.h index 2cb7922..7587581 100644 --- a/psm3/psm_stats.h +++ b/psm3/psm_stats.h @@ -79,11 +79,6 @@ #define PSMI_STATSTYPE_RV_RDMA 0x04000 /* RV shared conn RDMA */ #endif /* PSM_VERBS */ #define PSMI_STATSTYPE_FAULTINJ 0x08000 /* fault injection - PSM_FI */ -#ifdef PSM_OPA -#define PSMI_STATSTYPE_DEVCOUNTERS 0x10000 -#define PSMI_STATSTYPE_DEVSTATS 0x20000 -#define _PSMI_STATSTYPE_DEVMASK 0xf0000 -#endif #define PSMI_STATSTYPE_ALL 0xfffff #define _PSMI_STATSTYPE_SHOWZERO 0x100000 diff --git a/psm3/psm_user.h b/psm3/psm_user.h index 22568c4..495b740 100644 --- a/psm3/psm_user.h +++ b/psm3/psm_user.h @@ -81,17 +81,11 @@ extern "C" { #endif /* This indicates at least 1 HAL in the build can perform Send DMA */ -#ifdef PSM_OPA -#define PSM_HAVE_SDMA -#endif #ifdef PSM_VERBS #define PSM_HAVE_SDMA #endif /* This indicates at least 1 HAL in the build can perform RDMA */ -#ifdef PSM_OPA -#define PSM_HAVE_RDMA -#endif #ifdef PSM_VERBS #define PSM_HAVE_RDMA #endif @@ -110,9 +104,6 @@ extern "C" { #endif /* UD || (UDP & CUDA) */ #endif /* RNDV_MOD */ -#if defined(PSM_ONEAPI) && defined(PSM_OPA) -#error "No support for OneAPI ZE for OPA" -#endif #include "psm_config.h" #include @@ -194,9 +185,7 @@ psm2_error_t psm3_mq_wait_internal(psm2_mq_req_t *ireq); int psm3_get_current_proc_location(); -#ifndef PSM_OPA extern int psm3_allow_routers; -#endif extern uint32_t non_dw_mul_sdma; extern psmi_lock_t psm3_creation_lock; extern psm2_ep_t psm3_opened_endpoint; diff --git a/psm3/psm_utils.c b/psm3/psm_utils.c index 854b5e8..65529b2 100644 --- a/psm3/psm_utils.c +++ b/psm3/psm_utils.c @@ -104,43 +104,6 @@ typedef union { uint32_t addr_fmt:3; uint32_t rest:29; }; -#ifdef PSM_OPA - struct { // InfiniPath shm and self when IPS device disabled - uint32_t addr_fmt:3; // = PSMI_ADDR_FMT_SHM - uint32_t reserved1:2; // = 0 - uint32_t rank_low:3; // rank bits 0-2 - uint32_t rank_high:8; // rank bits 3-11 - uint32_t job_key:16; // low 16 bits of uuid_t job_key - uint32_t rank:30; - uint32_t reserved2:2; - } v1_shm; - struct { // InfiniPath - uint32_t addr_fmt:3; // = PSMI_ADDR_FMT_IPATH - uint32_t unit:2; - uint32_t subctxt:3; - uint32_t context:8; - uint32_t lid:16; - uint32_t subnet:30; // low 30 bits of subnet_prefix 
- uint32_t reserved:2; - } v1; - struct { // OPA100 Native - uint32_t addr_fmt:3; // = PSMI_ADDR_FMT_OPA - uint32_t shm_only:1; // = 0 - uint32_t reserved1:1; - uint32_t subctxt:3; - uint32_t context:8; - uint32_t lid:16; - uint32_t reserved2:16; - uint32_t subnet:16; // low 16 bits of subnet_prefix - } v2; - struct { // OPA100 Native shm and self when IPS device disabled - uint32_t addr_fmt:3; // = PSMI_ADDR_FMT_OPA - uint32_t shm_only:1; // = PSMI_EPID_SHM_ONLY - uint32_t reserved:28; - uint32_t pid:32; - } v2_shm; - // addr_fmt>2 invalid -#else struct { // shm and self when IPS device disabled uint32_t addr_fmt:3; // = PSMI_ADDR_FMT_SHM // TBD don't need shm_only field anymore, EPID will be non-zero @@ -201,7 +164,6 @@ typedef union { uint64_t gid_hi; // subnet_prefix uint64_t gid_lo; // interface_id } v6; -#endif } psmi_epid_t; int psm3_ep_device_is_enabled(const psm2_ep_t ep, int devid); @@ -535,53 +497,6 @@ static inline psmi_bare_netaddr128_t psmi_prefix_len_to_ipv6_netmask(int count) /* These functions build the local epid */ /* This is a typical multi-node job */ -#ifdef PSM_OPA -psm2_epid_t psm3_epid_pack_ips(uint16_t lid, uint8_t context, - uint8_t subcontext, uint8_t hfiunit, psmi_naddr128_t addr) -{ - psmi_epid_t epid; - - psmi_assert(sizeof(psm2_epid_t) == sizeof(psmi_epid_t)); - psmi_assert(addr.fmt == psm3_addr_fmt); - psmi_assert(addr.prefix_len == 64); - switch (psm3_addr_fmt) { - case PSMI_ADDR_FMT_IPATH: - epid.v1.addr_fmt = PSMI_ADDR_FMT_IPATH; - epid.v1.unit = hfiunit; - epid.v1.subctxt = subcontext; - epid.v1.context = context; - epid.v1.lid = lid; - epid.v1.subnet = 0x3ffffff; - epid.v1.reserved = 0; - epid.w[1] = 0; - epid.w[2] = 0; - - psmi_assert(psm3_epid_addr_fmt(epid.psm2_epid) == PSMI_ADDR_FMT_IPATH); - break; - case PSMI_ADDR_FMT_OPA: - epid.v2.addr_fmt = PSMI_ADDR_FMT_OPA; - epid.v2.shm_only = PSMI_EPID_IPS_SHM; - epid.v2.reserved1 = 0; - epid.v2.subctxt = subcontext; - epid.v2.context = context; - epid.v2.lid = lid; - epid.v2.reserved2 = 0; - epid.v2.subnet = addr.bare.hi & 0xffff; - epid.w[1] = 0; - epid.w[2] = 0; - psmi_assert(psm3_epid_addr_fmt(epid.psm2_epid) == PSMI_ADDR_FMT_OPA); - break; - default: - /* Epid addr_fmt is greater than max supported formats. */ - psmi_assert_always(psm3_addr_fmt <= PSMI_ADDR_FMT_OPA); - psmi_assert_always(psm3_addr_fmt != PSMI_ADDR_FMT_SHM); - epid.w[0] = 0; // keep compiler happy, never reached - break; - } - psmi_assert(sizeof(psm2_epid_t) == sizeof(psmi_epid_t)); - return epid.psm2_epid; -} -#else // PSM_OPA // IB or OPA with Verbs psm2_epid_t psm3_epid_pack_ib(uint16_t lid, uint32_t qp_num, psmi_naddr128_t addr) @@ -696,9 +611,7 @@ psm2_epid_t psm3_epid_pack_ipv6(psmi_naddr128_t ipv6_addr, psmi_assert(psm3_epid_protocol(epid.psm2_epid) == protocol); return epid.psm2_epid; } -#endif // PSM_OPA -#ifndef PSM_OPA // find the 1st IPv4 or IPv6 address (excluding loopback) in the node // we will use this as the NID for a FMT_SHM EPID so we can detect incorrect // attempts to run a multi-node job across shm (or self) @@ -746,13 +659,11 @@ static void psmi_get_shm_nid(uint64_t *gid_hi, uint64_t *gid_lo) } return; } -#endif /* PSM_OPA */ /* This is a shm-only epid (single node job) */ psm2_epid_t psm3_epid_pack_shm(const psm2_uuid_t unique_job_key) { psmi_epid_t epid; -#ifndef PSM_OPA // TBD - possible duplicate epid for shm-only job with multi-ep // but probably not an issue since we don't cross connect shm ep's? 
epid.shm.addr_fmt = PSMI_ADDR_FMT_SHM; @@ -761,57 +672,6 @@ psm2_epid_t psm3_epid_pack_shm(const psm2_uuid_t unique_job_key) epid.shm.pid = getpid(); psmi_get_shm_nid(&epid.shm.gid_hi, &epid.shm.gid_lo); psmi_assert(psm3_epid_addr_fmt(epid.psm2_epid) == PSMI_ADDR_FMT_SHM); -#else - int rank; - - /* In shm-only mode, we need to derive a valid epid - * based on our rank. We try to get it from the - * environment if its available, or resort to using - * our PID as the rank. - */ - rank = psm3_get_mylocalrank(); - if (rank < 0) - rank = getpid(); - - /* - * We use a LID of 0 for non-HFI communication. - * Since a jobkey is not available from IPS, pull the - * first 16 bits from the UUID. - */ - switch (psm3_addr_fmt) { - case PSMI_ADDR_FMT_IPATH: - // OPA did it like this, odd to specify addr_fmt SHM but - // pack into a V1 format - epid.v1_shm.addr_fmt = PSMI_ADDR_FMT_SHM; - epid.v1_shm.reserved1 = 0; - epid.v1_shm.rank_low = rank & 0x7; - epid.v1_shm.rank_high = rank >> 3; - epid.v1_shm.job_key = ((uint16_t *) unique_job_key)[0]; - epid.v1_shm.rank = rank; - epid.v1_shm.reserved2 = 0; - epid.w[1] = 0; - epid.w[2] = 0; - psmi_assert(psm3_epid_addr_fmt(epid.psm2_epid) == PSMI_ADDR_FMT_SHM); - break; - case PSMI_ADDR_FMT_OPA: - // also odd we pack a special varision of V2 format for SHM - // but call it v2, flag tells us it's odd, no one checks - epid.v2_shm.addr_fmt = PSMI_ADDR_FMT_OPA; - epid.v2_shm.shm_only = PSMI_EPID_SHM_ONLY; - epid.v2_shm.reserved = 0; - epid.v2_shm.pid = getpid(); - epid.w[1] = 0; - epid.w[2] = 0; - psmi_assert(psm3_epid_addr_fmt(epid.psm2_epid) == PSMI_ADDR_FMT_OPA); - break; - default: - /* Epid addr_fmt is greater than max supported addr_fmt. */ - psmi_assert_always(psm3_addr_fmt <= PSMI_ADDR_FMT_OPA); - psmi_assert_always(psm3_addr_fmt != PSMI_ADDR_FMT_SHM); - epid.w[0] = 0; // keep compiler happy, never reached - break; - } -#endif // PSM_OPA psmi_assert(sizeof(psm2_epid_t) == sizeof(psmi_epid_t)); return epid.psm2_epid; } @@ -820,7 +680,6 @@ psm2_epid_t psm3_epid_pack_shm(const psm2_uuid_t unique_job_key) psm2_epid_t psm3_epid_pack_self(void) { psmi_epid_t epid; -#ifndef PSM_OPA // TBD - possible duplicate epid for self-only job with multi-ep // but probably not an issue since we don't cross connect self ep's? epid.shm.addr_fmt = PSMI_ADDR_FMT_SHM; @@ -828,37 +687,6 @@ psm2_epid_t psm3_epid_pack_self(void) epid.shm.reserved = 0; epid.shm.pid = getpid(); psmi_get_shm_nid(&epid.shm.gid_hi, &epid.shm.gid_lo); -#else - switch (psm3_addr_fmt) { - case PSMI_ADDR_FMT_IPATH: - epid.v1_shm.addr_fmt = PSMI_ADDR_FMT_SHM; - epid.v1_shm.reserved1 = 0; - epid.v1_shm.rank_low = 0; - epid.v1_shm.rank_high = 0; - epid.v1_shm.job_key = 0; - epid.v1_shm.rank = 0x3ffffff; - epid.v1_shm.reserved2 = 0; - epid.w[1] = 0; - epid.w[2] = 0; - psmi_assert(psm3_epid_addr_fmt(epid.psm2_epid) == PSMI_ADDR_FMT_SHM); - break; - case PSMI_ADDR_FMT_OPA: - epid.v2_shm.addr_fmt = PSMI_ADDR_FMT_OPA; - epid.v2_shm.shm_only = PSMI_EPID_SHM_ONLY; - epid.v2_shm.reserved = 0; - epid.v2_shm.pid = 0; - epid.w[1] = 0; - epid.w[2] = 0; - psmi_assert(psm3_epid_addr_fmt(epid.psm2_epid) == PSMI_ADDR_FMT_OPA); - break; - default: - /* Epid addr_fmt is greater than max supportd addr_fmt. 
*/ - psmi_assert_always(psm3_addr_fmt <= PSMI_ADDR_FMT_OPA); - psmi_assert_always(psm3_addr_fmt != PSMI_ADDR_FMT_SHM); - epid.w[0] = 0; // keep compiler happy, never reached - break; - } -#endif // PSM_OPA psmi_assert(sizeof(psm2_epid_t) == sizeof(psmi_epid_t)); return epid.psm2_epid; } @@ -868,21 +696,11 @@ psm2_epid_t psm3_epid_pack_diag(int val) { // just need a valid epid which is different for each val given psmi_epid_t epid; -#ifndef PSM_OPA epid.shm.addr_fmt = PSMI_ADDR_FMT_SHM; epid.shm.shm_only = PSMI_EPID_SHM_ONLY; epid.shm.reserved = 0; epid.shm.pid = val; psmi_get_shm_nid(&epid.shm.gid_hi, &epid.shm.gid_lo); -#else - epid.v2_shm.addr_fmt = PSMI_ADDR_FMT_OPA; - epid.v2_shm.shm_only = PSMI_EPID_SHM_ONLY; - epid.v2_shm.reserved = 0; - epid.v2_shm.pid = val; - epid.w[1] = 0; - epid.w[2] = 0; - psmi_assert(psm3_epid_addr_fmt(epid.psm2_epid) == PSMI_ADDR_FMT_OPA); -#endif psmi_assert(sizeof(psm2_epid_t) == sizeof(psmi_epid_t)); return epid.psm2_epid; } @@ -894,7 +712,6 @@ uint8_t psm3_epid_addr_fmt(psm2_epid_t epid) return e.addr_fmt; } -#ifndef PSM_OPA psmi_eth_proto_t psm3_epid_protocol(psm2_epid_t epid) { psmi_epid_t e = { .psm2_epid = epid }; @@ -920,7 +737,6 @@ psmi_eth_proto_t psm3_epid_protocol(psm2_epid_t epid) break; } } -#endif // The network id (address) from the epid // depending on epid addr_fmt this may be a lid/subnet or ipv4 address @@ -934,26 +750,6 @@ psm2_nid_t psm3_epid_nid(psm2_epid_t epid) { psmi_epid_t ret = { .psm2_epid = epid }; switch (ret.addr_fmt) { -#ifdef PSM_OPA - case PSMI_ADDR_FMT_SHM: - ret.v1_shm.rank_low = 0; - ret.v1_shm.rank_high = 0; - ret.v1_shm.rank = 0; - ret.v1_shm.reserved1 = 1; // make sure nid != 0 - break; - case PSMI_ADDR_FMT_IPATH: - ret.v1.subctxt = 0; - ret.v1.context = 0; - break; - case PSMI_ADDR_FMT_OPA: - if (ret.v2.shm_only) { - ret.v2_shm.pid = 0; - } else { - ret.v2.subctxt = 0; - ret.v2.context = 0; - } - break; -#else // PSM_OPA case PSMI_ADDR_FMT_SHM: ret.shm.pid = 0; break; @@ -968,7 +764,6 @@ psm2_nid_t psm3_epid_nid(psm2_epid_t epid) ret.v6.protocol = 0; ret.v6.context = 0; break; -#endif // PSM_OPA default: psmi_assert_always(0); // unexpected addr_fmt break; @@ -983,15 +778,7 @@ psm2_nid_t psm3_epid_nid(psm2_epid_t epid) // Only valid for subnet used in remote IPS connections static psmi_subnet128_t psmi_subnet_epid_subset(psmi_subnet128_t subnet) { -#ifdef PSM_OPA - psmi_subnet128_t ret = subnet; - // TBD for FMT_IPATH subnet.bare.hi == 0x3ffffff; - // for OPA we only pass low 16 bits of subnet in epid - ret.bare.hi &= 0xffff; - return ret; -#else return subnet; -#endif } // Get the subnet for the given EPID. 
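
The retained psm3_epid_addr_fmt() above simply returns the 3-bit addr_fmt field that every remaining psmi_epid_t layout (shm, IB, IPv4, IPv6) places at the start of word 0, and helpers such as psm3_epid_nid() then switch on that value. A minimal sketch of the same idea, using a hypothetical demo_epid_t and demo_epid_addr_fmt() rather than the real psmi_epid_t union, and assuming the little-endian bitfield layout so addr_fmt lands in the low 3 bits of the first word:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the 3-word psm2_epid_t; only the low 3 bits
 * of w[0] carry the address format here, mirroring the addr_fmt:3
 * bitfield that leads every retained psmi_epid_t layout (little-endian
 * layout assumed). */
typedef struct { uint64_t w[3]; } demo_epid_t;

enum {
	DEMO_ADDR_FMT_SHM  = 0,	/* shm-only or self-only */
	DEMO_ADDR_FMT_IB   = 3,	/* IB/OPA UD Verbs */
	DEMO_ADDR_FMT_IPV4 = 4,	/* Eth UD/UDP IPv4 */
};

/* Same idea as psm3_epid_addr_fmt(): the format is recoverable from the
 * first word alone, so helpers like psm3_epid_nid() can switch on it
 * without knowing which layout the remaining words use. */
static unsigned demo_epid_addr_fmt(demo_epid_t e)
{
	return (unsigned)(e.w[0] & 0x7);
}

int main(void)
{
	demo_epid_t e = { .w = { DEMO_ADDR_FMT_IPV4 | (0x1234u << 3), 0, 0 } };

	printf("addr_fmt=%u\n", demo_epid_addr_fmt(e));	/* prints addr_fmt=4 */
	return 0;
}
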
@@ -1001,29 +788,6 @@ psmi_subnet128_t psm3_epid_subnet(psm2_epid_t epid) { psmi_epid_t e = { .psm2_epid = epid }; psmi_subnet128_t ret = { }; -#ifdef PSM_OPA - switch (e.addr_fmt) { - case PSMI_ADDR_FMT_IPATH: - ret.bare.hi = e.v1.subnet; - ret.bare.lo = 0; - ret.fmt = PSMI_ADDR_FMT_IPATH; - ret.prefix_len = 64; - return ret; - break; - case PSMI_ADDR_FMT_OPA: - ret.bare.hi = e.v2.subnet; - ret.bare.lo = 0; - ret.fmt = PSMI_ADDR_FMT_OPA; - ret.prefix_len = 64; - return ret; - break; - case PSMI_ADDR_FMT_SHM: - default: - psmi_assert_always(0); // unexpected addr_fmt - return ret; // keep compiler happy, never reached - break; - } -#else // PSM_OPA psmi_bare_netaddr128_t nm; switch (e.addr_fmt) { case PSMI_ADDR_FMT_SHM: // only called for remote IPS connections @@ -1057,7 +821,6 @@ psmi_subnet128_t psm3_epid_subnet(psm2_epid_t epid) return ret; // keep compiler happy, never reached break; } -#endif // PSM_OPA } // Get the subnet prefix_len for the given EPID. @@ -1065,21 +828,6 @@ psmi_subnet128_t psm3_epid_subnet(psm2_epid_t epid) uint8_t psm3_epid_prefix_len(psm2_epid_t epid) { psmi_epid_t e = { .psm2_epid = epid }; -#ifdef PSM_OPA - switch (e.addr_fmt) { - case PSMI_ADDR_FMT_IPATH: - return 64; - break; - case PSMI_ADDR_FMT_OPA: - return 64; - break; - case PSMI_ADDR_FMT_SHM: - default: - psmi_assert_always(0); // unexpected addr_fmt - return 0; // keep compiler happy, never reached - break; - } -#else // PSM_OPA switch (e.addr_fmt) { case PSMI_ADDR_FMT_SHM: // only called for remote IPS connections psmi_assert_always(0); // unexpected addr_fmt @@ -1099,7 +847,6 @@ uint8_t psm3_epid_prefix_len(psm2_epid_t epid) return 0; // keep compiler happy, never reached break; } -#endif // PSM_OPA } // The locally unique identifiers for the HW resources @@ -1110,23 +857,6 @@ uint8_t psm3_epid_prefix_len(psm2_epid_t epid) uint64_t psm3_epid_context(psm2_epid_t epid) { psmi_epid_t e = { .psm2_epid = epid }; -#ifdef PSM_OPA - switch (e.addr_fmt) { - case PSMI_ADDR_FMT_SHM: // can be called by psm3_epid_fmt_addr - return e.v1_shm.rank_high; - break; - case PSMI_ADDR_FMT_IPATH: - return e.v1.context; - break; - case PSMI_ADDR_FMT_OPA: - return e.v2.context; - break; - default: - psmi_assert_always(0); // unexpected addr_fmt - return 0; // keep compiler happy, never reached - break; - } -#else // PSM_OPA switch (e.addr_fmt) { case PSMI_ADDR_FMT_SHM: // can be called by psm3_epid_fmt_addr return e.shm.pid; @@ -1167,7 +897,6 @@ uint64_t psm3_epid_context(psm2_epid_t epid) return 0; // keep compiler happy, never reached break; } -#endif // PSM_OPA } #ifdef PSM_SOCKETS @@ -1214,27 +943,6 @@ uint16_t psm3_epid_aux_socket(psm2_epid_t epid) } #endif /* PSM_SOCKETS */ -#ifdef PSM_OPA -uint64_t psm3_epid_subcontext(psm2_epid_t epid) -{ - psmi_epid_t e = { .psm2_epid = epid }; - switch (e.addr_fmt) { - case PSMI_ADDR_FMT_SHM: // can be called by psm3_epid_fmt_addr - return e.v1_shm.rank_low; - break; - case PSMI_ADDR_FMT_IPATH: - return e.v1.subctxt; - break; - case PSMI_ADDR_FMT_OPA: - return e.v2.subctxt; - break; - default: - psmi_assert_always(0); // unexpected addr_fmt - return 0; // keep compiler happy, never reached - break; - } -} -#endif // return appropriate LID to use // for Ethernet, 1 is returned but not used beyond checkng LID != 0 @@ -1242,14 +950,6 @@ uint16_t psm3_epid_lid(psm2_epid_t epid) { psmi_epid_t e = { .psm2_epid = epid }; switch (e.addr_fmt) { -#ifdef PSM_OPA - case PSMI_ADDR_FMT_IPATH: - return e.v1.lid; - break; - case PSMI_ADDR_FMT_OPA: - return e.v2.lid; - break; -#else case 
PSMI_ADDR_FMT_IB: return e.v3.lid; break; @@ -1259,7 +959,6 @@ uint16_t psm3_epid_lid(psm2_epid_t epid) case PSMI_ADDR_FMT_IPV6: return 1; // not really used, but lid must be != 0 break; -#endif default: psmi_assert_always(0); // unexpected addr_fmt return 0; // keep compiler happy, never reached @@ -1267,7 +966,6 @@ uint16_t psm3_epid_lid(psm2_epid_t epid) } } -#ifndef PSM_OPA // get information needed to build a verbs AV // gid returned in format suitable to build an IPv6 GID for AV // lid and gid in host byte order @@ -1325,14 +1023,10 @@ uint32_t psm3_epid_get_rem_addr(psm2_epid_t epid) break; } } -#endif // return a good portion of epid which can be used for hashing and randomizing uint64_t psm3_epid_hash(psm2_epid_t epid) { -#ifdef PSM_OPA - return epid.w[0]; -#else psmi_epid_t e = { .psm2_epid = epid }; switch (e.addr_fmt) { case PSMI_ADDR_FMT_SHM: @@ -1347,7 +1041,6 @@ uint64_t psm3_epid_hash(psm2_epid_t epid) return e.v6.gid_lo; break; } -#endif } /* @@ -1426,7 +1119,6 @@ int psm3_is_speed_allowed(int unit, uint64_t speed) } #undef MBPS -#ifndef PSM_OPA static int psm3_allow_subnet(const char *subnet, const char *subnet_type) { int i; @@ -1485,7 +1177,6 @@ int psm3_allow_ipv6_subnet(psmi_bare_netaddr128_t subnet, uint8_t prefix_len) const char *subnet_str = psmi_ipv6_ntop(subnet, prefix_len, buf, sizeof(buf)); return psm3_allow_subnet(subnet_str, "IPv6"); } -#endif /* PSM_OPA */ // build an IB/OPA subnet from basic addressing information psmi_subnet128_t psm3_build_ib_subnet128(uint64_t hi) @@ -1494,11 +1185,7 @@ psmi_subnet128_t psm3_build_ib_subnet128(uint64_t hi) subnet.bare.hi = hi; subnet.bare.lo = 0; -#ifdef PSM_OPA - subnet.fmt = psm3_addr_fmt; -#else subnet.fmt = PSMI_ADDR_FMT_IB; -#endif subnet.prefix_len = 64; return subnet; } @@ -1510,16 +1197,11 @@ psmi_naddr128_t psm3_build_ib_naddr128(psmi_gid128_t gid) addr.bare.hi = gid.hi; addr.bare.lo = gid.lo; -#ifdef PSM_OPA - addr.fmt = psm3_addr_fmt; -#else addr.fmt = PSMI_ADDR_FMT_IB; -#endif addr.prefix_len = 64; return addr; } -#ifndef PSM_OPA // build an IPv4 subnet from basic addressing information psmi_subnet128_t psm3_build_ipv4_subnet128(uint32_t ip_addr, uint32_t ip_netmask, uint8_t prefix_len) @@ -1568,7 +1250,6 @@ psmi_naddr128_t psm3_build_ipv6_naddr128(psmi_bare_netaddr128_t ip_addr, addr.prefix_len = prefix_len; return addr; } -#endif // build a NID from basic addressing information for later comparison to // psm3_epid_nid() @@ -1580,11 +1261,6 @@ psm2_nid_t psm3_build_nid(uint8_t unit, psmi_naddr128_t addr, unsigned lid) { psm2_nid_t ret = { }; // never used, keep compiler happy below switch (addr.fmt) { -#ifdef PSM_OPA - case PSMI_ADDR_FMT_IPATH: - case PSMI_ADDR_FMT_OPA: - return psm3_epid_pack_ips(lid, 0, 0, unit, addr); -#else case PSMI_ADDR_FMT_IB: return psm3_epid_pack_ib(lid, 0, addr); break; @@ -1594,40 +1270,18 @@ psm2_nid_t psm3_build_nid(uint8_t unit, psmi_naddr128_t addr, unsigned lid) case PSMI_ADDR_FMT_IPV6: return psm3_epid_pack_ipv6(addr, 0, 0, 0); break; -#endif default: psmi_assert_always(0); // unexpected addr_fmt return ret; // keep compiler happy, never reached } } -#ifdef PSM_OPA -// for IPS connect we get 1 extra 64b word -// to hold enough information to reconstruct the full psmi_subnet128_t from the -// epid and this value (for some addr_fmt, the epid can only reconstruct a -// subset of the subnet information) -uint64_t psm3_epid_subnet_extra_word(psmi_subnet128_t subnet) -{ - return subnet.bare.hi; -} -#endif -#ifdef PSM_OPA -psmi_subnet128_t psmi_subnet_pack(psm2_epid_t epid, 
uint64_t extra_word) -{ - return psm3_build_ib_subnet128(extra_word); -} -#endif // impose the addr_fmt specific rules for when we allow // diferent subnets to still be able to connect int psm3_subnets_match(psmi_subnet128_t a, psmi_subnet128_t b) { -#ifdef PSM_OPA - if (a.fmt != b.fmt) - return 0; // for PSM_OPA V1 and V2 can't interop - return (a.bare.hi == b.bare.hi); -#else int is_eth = PSMI_ADDR_FMT_IS_ETH(a.fmt); // note psm3_ep_connect enforces that addr_fmt must match // so when we get down to comparing subnets of each rail we @@ -1641,7 +1295,6 @@ int psm3_subnets_match(psmi_subnet128_t a, psmi_subnet128_t b) return ((a.prefix_len == b.prefix_len && a.bare.hi == b.bare.hi && a.bare.lo == b.bare.lo) || (is_eth && psm3_allow_routers)); -#endif } // compare our local subnet to a remote epids's subnet @@ -1742,25 +1395,16 @@ psm2_epid_t psm3_epid_pack_words(uint64_t w0, uint64_t w1, uint64_t w2) e.w[0] = w0; e.w[1] = w1; e.w[2] = w2; -#ifndef PSM_OPA psmi_assert(e.addr_fmt == PSMI_ADDR_FMT_SHM || PSMI_IPS_ADDR_FMT_IS_VALID(e.addr_fmt)); #ifdef PSM_DEBUG if (e.addr_fmt == PSMI_ADDR_FMT_IPV6 || e.addr_fmt == PSMI_ADDR_FMT_IB) psmi_assert(w1 != 0 || w2 != 0); else if (e.addr_fmt == PSMI_ADDR_FMT_IPV4) psmi_assert(w2 == 0); -#endif #endif return e.psm2_epid; } -#ifdef PSM_OPA -/* pack a single word epid */ -psm2_epid_t psm3_epid_pack_word(uint64_t w0) -{ - return psm3_epid_pack_words(w0, 0, 0); -} -#endif #if 0 psm2_epid_t psm2_epid_pack_word(uint64_t w0) @@ -1890,14 +1534,6 @@ const char *psm3_epid_str_addr_fmt(psm2_epid_t epid) case PSMI_ADDR_FMT_SHM: return "shm"; break; -#ifdef PSM_OPA - case PSMI_ADDR_FMT_IPATH: - return "Truescale"; - break; - case PSMI_ADDR_FMT_OPA: - return "OPA"; - break; -#else case PSMI_ADDR_FMT_IB: return "IB/OPA"; break; @@ -1907,14 +1543,12 @@ const char *psm3_epid_str_addr_fmt(psm2_epid_t epid) case PSMI_ADDR_FMT_IPV6: return "IPv6"; break; -#endif default: return "Unknown"; break; } } -#ifndef PSM_OPA // for error messages and psm3_subnet128_fmt_name // note that psm3_subnet128_fmt_name sets the OFI fi_info // fabric name @@ -1939,9 +1573,7 @@ static const char *psm3_protocol_str(psmi_eth_proto_t protocol) break; } } -#endif -#ifndef PSM_OPA // returns for psmi_epid_fmt_addr a string describing psm3_epid_context static const char *psm3_protocol_context_str(psmi_eth_proto_t protocol) { @@ -1962,14 +1594,11 @@ static const char *psm3_protocol_context_str(psmi_eth_proto_t protocol) break; } } -#endif -#ifndef PSM_OPA const char *psm3_epid_str_protocol(psm2_epid_t epid) { return psm3_protocol_str(psm3_epid_protocol(epid)); } -#endif // for sockets just show primary context // this is used for process label in logs if rank is not available @@ -1977,14 +1606,8 @@ const char *psm3_epid_fmt_context(psm2_epid_t epid, int bufno) { char *outstr = outstrbufs[bufno]; -#ifdef PSM_OPA - snprintf(outstr, sizeof(outstrbufs[0]), "%u.%u", - (uint8_t)psm3_epid_context(epid), - (uint8_t)psm3_epid_subcontext(epid)); -#else snprintf(outstr, sizeof(outstrbufs[0]), "%u", (uint32_t)psm3_epid_context(epid)); -#endif return outstr; } @@ -1999,30 +1622,6 @@ const char *psm3_nid_fmt(psm2_nid_t nid, int bufno) { char *outstr = outstrbufs[bufno]; psmi_epid_t e = { .psm2_epid = nid }; -#ifdef PSM_OPA - - switch (e.addr_fmt) { - case PSMI_ADDR_FMT_SHM: - snprintf(outstr, sizeof(outstrbufs[0]), "RANK=%u", - (uint32_t)e.v1_shm.rank); - break; - case PSMI_ADDR_FMT_IPATH: - snprintf(outstr, sizeof(outstrbufs[0]), "LID=%u", - (uint16_t)e.v1.lid); - break; - case PSMI_ADDR_FMT_OPA: - if ( 
e.v2.shm_only) - snprintf(outstr, sizeof(outstrbufs[0]), "PID=%u", - (uint32_t)e.v2_shm.pid); - else - snprintf(outstr, sizeof(outstrbufs[0]), "LID=%u", - (uint16_t)e.v2.lid); - break; - default: - psmi_assert_always(0); // unexpected addr_fmt - break; - } -#else // PSM_OPA psmi_bare_netaddr128_t bare; char buf[INET6_ADDRSTRLEN+4]; @@ -2052,7 +1651,6 @@ const char *psm3_nid_fmt(psm2_nid_t nid, int bufno) psmi_assert_always(0); // unexpected addr_fmt break; } -#endif return outstr; } @@ -2082,36 +1680,6 @@ const char *psm3_epid_fmt_addr(psm2_epid_t epid, int bufno) { char *outstr = outstrbufs[bufno]; psmi_epid_t e = { .psm2_epid = epid }; -#ifdef PSM_OPA - - switch (e.addr_fmt) { - case PSMI_ADDR_FMT_SHM: - snprintf(outstr, sizeof(outstrbufs[0]), "RANK=%u", - (uint32_t)e.v1_shm.rank); - break; - case PSMI_ADDR_FMT_IPATH: - snprintf(outstr, sizeof(outstrbufs[0]), "LID=%u:%u.%u", - (uint16_t)e.v1.lid, - (uint16_t)e.v1.context, - (uint16_t)e.v1.subctxt); - break; - case PSMI_ADDR_FMT_OPA: - if ( e.v2.shm_only) - snprintf(outstr, sizeof(outstrbufs[0]), "PID=%u", - (uint32_t)e.v2_shm.pid); - else - snprintf(outstr, sizeof(outstrbufs[0]), "LID=%u:%u.%u", - (uint16_t)e.v2.lid, - (uint16_t)e.v2.context, - (uint16_t)e.v2.subctxt); - break; - default: - // might be called in psm3_handle_error, so output something instead of - // asserting - snprintf(outstr, sizeof(outstrbufs[0]), "Invalid Fmt"); - break; - } -#else // PSM_OPA psmi_bare_netaddr128_t bare; char buf[INET6_ADDRSTRLEN+4]; @@ -2170,7 +1738,6 @@ const char *psm3_epid_fmt_addr(psm2_epid_t epid, int bufno) snprintf(outstr, sizeof(outstrbufs[0]), "Invalid Fmt"); break; } -#endif return outstr; } @@ -2193,34 +1760,15 @@ const char *psm3_subnet128_fmt(psmi_subnet128_t subnet, int bufno) char *outstr = outstrbufs[bufno]; // TBD - handle V_SHM -#ifndef PSM_OPA if (subnet.fmt == PSMI_ADDR_FMT_IPV4) return psmi_ipv4_ntop(subnet.bare.lo, subnet.prefix_len, outstr, sizeof(outstrbufs[0])); else if (subnet.fmt == PSMI_ADDR_FMT_IPV6) return psmi_ipv6_ntop(subnet.bare, subnet.prefix_len, outstr, sizeof(outstrbufs[0])); else snprintf(outstr, sizeof(outstrbufs[0]), "0x%"PRIx64, subnet.bare.hi); -#else - snprintf(outstr, sizeof(outstrbufs[0]), "0x%"PRIx64, subnet.bare.hi); -#endif return outstr; } -#ifdef PSM_OPA -void psm3_subnet128_fmt_name(psmi_subnet128_t subnet, - char *buf, int buflen) -{ - switch (subnet.fmt) { - case PSMI_ADDR_FMT_IPATH: - case PSMI_ADDR_FMT_OPA: - snprintf(buf, buflen, "OPA-%s", psm3_subnet128_fmt(subnet, 0)); - break; - default: - psmi_assert_always(0); // unexpected addr_fmt - break; - } -} -#else /* PSM_OPA */ void psm3_subnet128_fmt_name(psmi_eth_proto_t protocol, psmi_subnet128_t subnet, char *buf, int buflen) { @@ -2251,7 +1799,6 @@ void psm3_subnet128_fmt_name(psmi_eth_proto_t protocol, psmi_subnet128_t subnet, break; } } -#endif /* PSM_OPA */ /* this returns just the subnet from decoding the epid * output has subnet in a more human readable format @@ -2269,7 +1816,6 @@ const char *psm3_epid_fmt_subnet(psm2_epid_t epid, int bufno) // IB/OPA addresses shown as a full 128b GID const char *psm3_naddr128_fmt(psmi_naddr128_t addr, int bufno) { -#ifndef PSM_OPA char *outstr = outstrbufs[bufno]; if (addr.fmt == PSMI_ADDR_FMT_IPV4) @@ -2278,9 +1824,6 @@ const char *psm3_naddr128_fmt(psmi_naddr128_t addr, int bufno) return psmi_ipv6_ntop(addr.bare, addr.prefix_len, outstr, sizeof(outstrbufs[0])); else return psm3_gid128_fmt(addr.bare, bufno); -#else - return psm3_gid128_fmt(addr.bare, bufno); -#endif } #ifdef PSM_VERBS @@ -2363,7 
+1906,6 @@ int psm3_epid_cmp_internal(psm2_epid_t a, psm2_epid_t b) int ret; ret = psm3_epid_cmp_word(a.w[0], b.w[0]); -#ifndef PSM_OPA if (ret) return ret; // 1st word's match, so addr_fmt should match @@ -2372,9 +1914,6 @@ int psm3_epid_cmp_internal(psm2_epid_t a, psm2_epid_t b) if (ret) return ret; return psm3_epid_cmp_word(a.w[2], b.w[2]); -#else - return ret; -#endif } int psm3_epid_cmp(psm2_epid_t a, psm2_epid_t b) @@ -2716,7 +2255,6 @@ int psm3_get_eth_ipv4_netmask(uint32_t ip_addr, uint32_t *netmask) } #endif /* PSM_VERBS */ -#ifndef PSM_OPA // used for IPv4 netmask processing. A valid netmask has a sequence of 1s // and then all other bits are 0. // This counts how many 1s are in the high end of the netmask and confirms @@ -2769,7 +2307,6 @@ int psm3_compute_ipv6_prefix_len(psmi_bare_netaddr128_t netmask) return i; } } -#endif #ifdef PSM_VERBS // given an IPv6 address, figure out which ifconfig entry matches and @@ -3254,17 +2791,10 @@ unsigned psmi_parse_gpudirect(void) return saved; psm3_getenv("PSM3_GPUDIRECT", -#ifdef PSM_OPA - "Use GPUDirect RDMA support to allow the NIC to directly read" - " from the GPU for SDMA and write to the GPU for TID RDMA." - " Also enable GPUDirect copy for more efficient CPU to/from GPU copies." - " Requires driver support.(default is disabled i.e. 0)", -#else "Use GPUDirect DMA and RDMA support to allow the NIC to directly read" " from the GPU for send DMA and write to the GPU for recv RDMA." " Also enable GPUDirect copy for more efficient CPU to/from GPU copies." " Requires rv module support.(default is disabled i.e. 0)", -#endif PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)0, /* Disabled by default */ &envval); @@ -5059,13 +4589,8 @@ static const char * OpcodeString(int opcode) case OPCODE_LONG_RTS: return "RTS"; case OPCODE_LONG_CTS: return "CTS"; case OPCODE_LONG_DATA: return "DATA"; -#ifdef PSM_OPA - case OPCODE_EXPTID: return "EXPTID"; - case OPCODE_EXPTID_COMPLETION: return "EXPTID_COMPLETION"; -#else case OPCODE_ERR_CHK_RDMA: return "ERR_CHK_RDMA"; case OPCODE_ERR_CHK_RDMA_RESP: return "ERR_CHK_RDMA_RESP"; -#endif default: return "UNKNOWN"; } } @@ -5420,13 +4945,6 @@ void psmi_log_message(const char *fileName, { txrx = va_arg(ap,psmi_log_tx_rx_t); dumpAddr[0] = va_arg(ap,struct ips_message_header *); -#ifdef PSM_OPA - if (txrx == PSM2_LOG_RX) - { - dumpAddr[1] = va_arg(ap,uint32_t *); - dumpSize[1] = sizeof(uint64_t); - } -#endif newFormat = va_arg(ap,const char *); dumpSize[0] = sizeof(struct ips_message_header); } @@ -5541,15 +5059,8 @@ void psmi_log_message(const char *fileName, } else if (format == PSM2_LOG_PKT_STRM_MAGIC) { -#ifdef PSM_OPA - MY_FPRINTF(IO_PORT,"PKT_STRM: %s: imh: %p%s ", TxRxString(txrx), - dumpAddr[0], (txrx == PSM2_LOG_RX) ? 
"," : ""); - if (txrx == PSM2_LOG_RX) - MY_FPRINTF(IO_PORT,"rhf: %p ", dumpAddr[1]); -#else MY_FPRINTF(IO_PORT,"PKT_STRM: %s: imh: %p ", TxRxString(txrx), dumpAddr[0]); -#endif goto dumpit; } else if (format == PSM2_LOG_DUMP_MAGIC) diff --git a/psm3/psm_utils.h b/psm3/psm_utils.h index f233a7b..06330e8 100644 --- a/psm3/psm_utils.h +++ b/psm3/psm_utils.h @@ -165,10 +165,6 @@ void psm3_epid_itor_fini(struct psmi_eptab_iterator *itor); /* These functions build the local epid */ // for typical job which includes IPS inter-node comms -#ifdef PSM_OPA -psm2_epid_t psm3_epid_pack_ips(uint16_t lid, uint8_t context, - uint8_t subcontext, uint8_t hfiunit, psmi_naddr128_t addr); -#else psm2_epid_t psm3_epid_pack_ib(uint16_t lid, uint32_t qp_num, psmi_naddr128_t addr); // IPv4 Ethernet (RoCE or UDP/TCP) @@ -181,7 +177,6 @@ psm2_epid_t psm3_epid_pack_ipv4(psmi_naddr128_t ipv4_addr, psm2_epid_t psm3_epid_pack_ipv6(psmi_naddr128_t ipv6_addr, psmi_eth_proto_t protocol, uint32_t context, uint16_t aux_sock); -#endif // for a shm-only job (1 node job) psm2_epid_t psm3_epid_pack_shm(const psm2_uuid_t unique_job_key); @@ -194,9 +189,7 @@ psm2_epid_t psm3_epid_pack_diag(int val); // These functions extract fields/information from the epid uint8_t psm3_epid_addr_fmt(psm2_epid_t epid); -#ifndef PSM_OPA psmi_eth_proto_t psm3_epid_protocol(psm2_epid_t epid); -#endif psm2_nid_t psm3_epid_nid(psm2_epid_t epid); const char *psm3_subnet_epid_subset_fmt(psmi_subnet128_t subnet, int bufno); psmi_subnet128_t psm3_epid_subnet(psm2_epid_t epid); @@ -206,12 +199,8 @@ uint64_t psm3_epid_context(psm2_epid_t epid); #ifdef PSM_SOCKETS uint16_t psm3_epid_aux_socket(psm2_epid_t epid); #endif -#ifdef PSM_OPA -uint64_t psm3_epid_subcontext(psm2_epid_t epid); -#else void psm3_epid_get_av(psm2_epid_t epid, uint16_t *lid, psmi_gid128_t *gid); uint32_t psm3_epid_get_rem_addr(psm2_epid_t epid); -#endif uint16_t psm3_epid_lid(psm2_epid_t epid); uint64_t psm3_epid_hash(psm2_epid_t epid); @@ -259,28 +248,11 @@ PSMI_ALWAYS_INLINE(psm2_nid_t psm3_nid_zeroed_internal(void)) return psm3_epid_zeroed_internal(); } -#ifdef PSM_OPA -// to and from 64b words for inclusion in connection packets -#define PSMI_EPID_LEN (sizeof(uint64_t)*1) // in bytes -#else #define PSMI_EPID_LEN (sizeof(uint64_t)*3) // in bytes -#endif psm2_epid_t psm3_epid_pack_words(uint64_t w0, uint64_t w1, uint64_t w2); -#ifdef PSM_OPA -psm2_epid_t psm3_epid_pack_word(uint64_t w0); -//psm2_epid_t psm2_epid_pack_word(uint64_t w0); -#endif uint64_t psm3_epid_w0(psm2_epid_t epid); uint64_t psm3_epid_w1(psm2_epid_t epid); uint64_t psm3_epid_w2(psm2_epid_t epid); -#ifdef PSM_OPA -// for IPS connect we get 1 extra 64b word -// to hold enough information to reconstruct the full psmi_subnet128_t from the -// epid and this value (for some addr_fmt, the epid can only reconstruct a -// subset of the subnet information) -uint64_t psm3_epid_subnet_extra_word(psmi_subnet128_t subnet); -psmi_subnet128_t psmi_subnet_pack(psm2_epid_t epid, uint64_t extra_word); -#endif /* @@ -293,9 +265,7 @@ const char *psm3_epid_fmt_nid(psm2_epid_t epid, int bufno); const char *psm3_epid_fmt_addr(psm2_epid_t epid, int bufno); const char *psm3_epid_fmt_subnet(psm2_epid_t epid, int bufno); const char *psm3_epid_str_addr_fmt(psm2_epid_t epid); -#ifndef PSM_OPA const char *psm3_epid_str_protocol(psm2_epid_t epid); -#endif const char *psm3_epaddr_get_hostname(psm2_epid_t epid, int bufno); const char *psm3_epaddr_get_name(psm2_epid_t epid, int bufno); psm2_error_t psm3_epid_set_hostname(psm2_nid_t nid, const char 
*hostname, diff --git a/psm3/ptl_ips/ips_config.h b/psm3/ptl_ips/ips_config.h index 22e84ec..ab7ebc9 100644 --- a/psm3/ptl_ips/ips_config.h +++ b/psm3/ptl_ips/ips_config.h @@ -90,11 +90,6 @@ #define IPS_FAULTINJ_SENDFULL 5000 /* 1 every X pkts no resource at send */ #define IPS_FAULTINJ_SENDFULLCTRL 5000 /* 1 every X pkts no resource at send ctrl */ #define IPS_FAULTINJ_SENDFULLCB 5000 /* 1 every X pkts no resource at send ctrl callback */ -#ifdef PSM_OPA -#define IPS_FAULTINJ_DMALOST 20 /* 1 every X dma writev get lost */ -#define IPS_FAULTINJ_PIOLOST 100 /* 1 every X pio writes get lost */ -#define IPS_FAULTINJ_PIOBUSY 10 /* 1 every X pio sends get busy */ -#endif #define IPS_FAULTINJ_SENDLOST 5000 /* 1 every X pkts dropped at send */ #define IPS_FAULTINJ_SENDPART 10 /* 1 every X pkts partial send */ #define IPS_FAULTINJ_RECVPART 10 /* 1 every X pkts partial recv */ @@ -115,28 +110,6 @@ #endif #endif /* PSM_FI */ -#ifdef PSM_OPA -/* We have to get an MTU of at least 2K, or else this breaks some assumptions - * in the packets that handle tid descriptors - */ -#define IPS_PROTOEXP_MIN_MTU 2048 - -/* TID */ - -/* Max tids a context can support */ -#define IPS_TID_MAX_TIDS 2048 -/* Max tid-session buffer size */ -#define PSM_TIDLIST_BUFSIZE 4096 -/* Max tid-session window size */ -#define PSM_TID_WINSIZE (4*1024*1024) -/* Max number of packets for a single TID flow, fitting tid-session window. - * In PSM2 packet integrity is realized by PSN (Packet Sequence Number), - * which is kept as 11 bits field (for 9B KDETH), - * giving max value 2048 (0 - 2047) */ -#define PSM_TID_MAX_PKTS 2048 -/* Total number of combined pages from the Tid-pair to be merged */ -#define PSM_MAX_NUM_PAGES_IN_TIDPAIR 512 -#endif /* rcv thread */ diff --git a/psm3/ptl_ips/ips_expected_proto.h b/psm3/ptl_ips/ips_expected_proto.h index ed24160..a25fa34 100644 --- a/psm3/ptl_ips/ips_expected_proto.h +++ b/psm3/ptl_ips/ips_expected_proto.h @@ -101,15 +101,9 @@ struct ips_protoexp { const struct ptl *ptl; struct ips_proto *proto; struct psmi_timer_ctrl *timerq; -#ifdef PSM_OPA - struct ips_tid tidc; -#endif struct ips_tf tfc; psm_transfer_type_t ctrl_xfer_type; -#ifdef PSM_OPA - psm_transfer_type_t tid_xfer_type; -#endif struct ips_scbctrl tid_scbc_rv; // pool of SCBs for TID sends // for OPA this includes: TIDEXP, CTS, // EXPTID_COMPLETION @@ -119,17 +113,7 @@ struct ips_protoexp { mpool_t tid_getreq_pool; mpool_t tid_sreq_pool; /* backptr into proto->ep->mq */ mpool_t tid_rreq_pool; /* backptr into proto->ep->mq */ -#ifdef PSM_OPA - struct drand48_data tidflow_drand48_data; -#endif uint32_t tid_flags; -#ifdef PSM_OPA - uint32_t tid_send_fragsize; - uint32_t tid_page_offset_mask; - uint64_t tid_page_mask; - uint32_t hdr_pkt_interval; - struct ips_tidinfo *tid_info; -#endif STAILQ_HEAD(ips_tid_send_pend, /* pending exp. sends */ ips_tid_send_desc) pend_sendq; @@ -157,55 +141,17 @@ struct ips_protoexp { #endif }; -#ifdef PSM_OPA -/* - * TID member list format used in communication. - * Since the compiler does not make sure the bit fields order, - * we use mask and shift defined below. 
-typedef struct { - uint32_t length:11; // in page unit, max 1024 pages - uint32_t reserved:9; // for future usage - uint32_t tidctrl:2; // hardware defined tidctrl value - uint32_t tid:10; // hardware only support 10bits -} -ips_tid_session_member; - */ -#define IPS_TIDINFO_LENGTH_SHIFT 0 -#define IPS_TIDINFO_LENGTH_MASK 0x7ff -#define IPS_TIDINFO_TIDCTRL_SHIFT 20 -#define IPS_TIDINFO_TIDCTRL_MASK 0x3 -#define IPS_TIDINFO_TID_SHIFT 22 -#define IPS_TIDINFO_TID_MASK 0x3ff - -#define IPS_TIDINFO_GET_LENGTH(tidinfo) \ - (((tidinfo)>>IPS_TIDINFO_LENGTH_SHIFT)&IPS_TIDINFO_LENGTH_MASK) -#define IPS_TIDINFO_GET_TIDCTRL(tidinfo) \ - (((tidinfo)>>IPS_TIDINFO_TIDCTRL_SHIFT)&IPS_TIDINFO_TIDCTRL_MASK) -#define IPS_TIDINFO_GET_TID(tidinfo) \ - (((tidinfo)>>IPS_TIDINFO_TID_SHIFT)&IPS_TIDINFO_TID_MASK) -#endif // This structure is used as CTS payload to describe TID receive // for UD it describes the destination for an RDMA Write // N/A for UDP typedef struct ips_tid_session_list_tag { -#ifndef PSM_OPA // TBD on how we will handle unaligned start/end at receiver uint32_t tsess_srcoff; /* source offset from beginning */ uint32_t tsess_length; /* session length, including start/end */ uint64_t tsess_raddr; /* RDMA virt addr this part of receiver's buffer */ /* already adjusted for srcoff */ uint32_t tsess_rkey; /* rkey for receiver's buffer */ -#else - uint8_t tsess_unaligned_start; /* unaligned bytes at starting */ - uint8_t tsess_unaligned_end; /* unaligned bytes at ending */ - uint16_t tsess_tidcount; /* tid number for the session */ - uint32_t tsess_tidoffset; /* offset in first tid */ - uint32_t tsess_srcoff; /* source offset from beginning */ - uint32_t tsess_length; /* session length, including start/end */ - - uint32_t tsess_list[0]; /* must be last in struct */ -#endif } PACK_SUFFIX ips_tid_session_list; /* @@ -231,9 +177,6 @@ struct ips_tid_send_desc { #if defined(PSM_VERBS) psm3_verbs_mr_t mr; -#elif defined(PSM_OPA) - /* tidflow to send tid traffic */ - struct ips_flow tidflow; #endif /* Iterated during send progress */ @@ -241,21 +184,7 @@ struct ips_tid_send_desc { void *buffer; uint32_t length; /* total length, includint start/end */ -#ifdef PSM_OPA - uint32_t tidbytes; /* bytes sent over tid so far */ - uint32_t remaining_tidbytes; - uint32_t offset_in_tid; /* could be more than page */ - uint32_t remaining_bytes_in_tid; -#endif -#ifdef PSM_OPA - uint16_t frame_send; - uint16_t tid_idx; - uint16_t is_complete; - uint16_t frag_size; - /* bitmap of queued control messages for flow */ - uint16_t ctrl_msg_queued; -#else uint8_t is_complete:1; // all packets for send queued, waiting CQE/response #ifdef PSM_HAVE_RNDV_MOD uint8_t rv_need_err_chk_rdma:1; // need to determine if a retry is required @@ -265,7 +194,6 @@ struct ips_tid_send_desc { #else uint8_t reserved:7; #endif -#endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) /* As size of cuda_hostbuf is less than equal to window size, @@ -276,24 +204,9 @@ struct ips_tid_send_desc { /* Number of hostbufs attached */ uint8_t cuda_num_buf; #endif -#ifndef PSM_OPA // ips_tid_session_list is fixed sized for UD // N/A to UDP ips_tid_session_list tid_list; -#else - /* - * tid_session_list is 24 bytes, plus 512 tidpair for 2048 bytes, - * so the max possible tid window size mq->hfi_base_window_rv is 4M. - * However, PSM must fit tid grant message into a single transfer - * unit, either PIO or SDMA, PSM will shrink the window accordingly. 
- */ - uint16_t tsess_tidlist_length; - union { - ips_tid_session_list tid_list; - uint8_t filler[PSM_TIDLIST_BUFSIZE+ - sizeof(ips_tid_session_list)]; - }; -#endif }; #define TIDRECVC_STATE_FREE 0 @@ -307,10 +220,6 @@ struct ips_expected_recv_stats { }; struct ips_tid_recv_desc { -#ifdef PSM_OPA - // could use protoexp->proto->ep->context, but this is more efficient - const psmi_context_t *context; -#endif struct ips_protoexp *protoexp; ptl_arg_t rdescid; /* reciever descid */ @@ -321,12 +230,6 @@ struct ips_tid_recv_desc { ips_scb_t *grantscb; #if defined(PSM_VERBS) psm3_verbs_mr_t mr; // MR for this message window/chunk -#elif defined(PSM_OPA) - /* scb to send tid data completion */ - ips_scb_t *completescb; - - /* tidflow to only send ctrl msg ACK and NAK */ - struct ips_flow tidflow; #endif /* TF protocol state (recv) */ @@ -343,32 +246,12 @@ struct ips_tid_recv_desc { void *buffer; uint32_t recv_msglen; -#ifdef PSM_OPA - uint32_t recv_tidbytes; /* exlcude start/end trim */ -#endif struct ips_expected_recv_stats stats; -#ifndef PSM_OPA // ips_tid_session_list is fixed sized for UD // N/A to UDP ips_tid_session_list tid_list; -#else - /* bitmap of queued control messages for */ - uint16_t ctrl_msg_queued; - /* - * tid_session_list is 24 bytes, plus 512 tidpair for 2048 bytes, - * so the max possible tid window size mq->hfi_base_window_rv is 4M. - * However, PSM must fit tid grant message into a single transfer - * unit, either PIO or SDMA, PSM will shrink the window accordingly. - */ - uint16_t tsess_tidlist_length; - union { - ips_tid_session_list tid_list; - uint8_t filler[PSM_TIDLIST_BUFSIZE+ - sizeof(ips_tid_session_list)]; - }; -#endif }; /* @@ -411,7 +294,6 @@ struct ips_tid_get_request { * Descriptor limits, structure contents of struct psmi_rlimit_mpool for * normal, min and large configurations. 
*/ -#ifndef PSM_OPA #define TID_SENDSESSIONS_LIMITS { \ .env = "PSM3_RDMA_SENDSESSIONS_MAX", \ .descr = "RDMA max send session descriptors", \ @@ -422,18 +304,6 @@ struct ips_tid_get_request { .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \ .mode[PSMI_MEMMODE_LARGE] = { 512, 16384 } \ } -#else -#define TID_SENDSESSIONS_LIMITS { \ - .env = "PSM3_TID_SENDSESSIONS_MAX", \ - .descr = "Tid max send session descriptors", \ - .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \ - .minval = 1, \ - .maxval = 1<<30, \ - .mode[PSMI_MEMMODE_NORMAL] = { 256, 8192 }, \ - .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \ - .mode[PSMI_MEMMODE_LARGE] = { 512, 16384 } \ - } -#endif /* * Expected send support @@ -451,20 +321,6 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, MOCK_DCL_EPILOGUE(psm3_ips_protoexp_init); psm2_error_t psm3_ips_protoexp_fini(struct ips_protoexp *protoexp); -#ifdef PSM_OPA -void -ips_protoexp_do_tf_seqerr(void *vpprotoexp - /* actually: struct ips_protoexp *protoexp */, - void *vptidrecvc - /* actually: struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr); -void -ips_protoexp_do_tf_generr(void *vpprotoexp - /* actually: struct ips_protoexp *protoexp */, - void *vptidrecvc - /* actually: struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr); -#endif #ifdef PSM_VERBS int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, @@ -477,14 +333,8 @@ int ips_protoexp_process_err_chk_rdma(struct ips_recvhdrq_event *rcv_ev); int ips_protoexp_process_err_chk_rdma_resp(struct ips_recvhdrq_event *rcv_ev); #endif // PSM_HAVE_RNDV_MOD -#elif defined(PSM_OPA) -int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev); -int ips_protoexp_recv_tid_completion(struct ips_recvhdrq_event *rcv_ev); #endif //PSM_VERBS -#ifdef PSM_OPA -psm2_error_t ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc); -#endif PSMI_ALWAYS_INLINE( void ips_protoexp_unaligned_copy(uint8_t *dst, uint8_t *src, uint16_t len)) diff --git a/psm3/ptl_ips/ips_opp_path_rec.c b/psm3/ptl_ips/ips_opp_path_rec.c index 3dcc3e6..b7bd6a9 100644 --- a/psm3/ptl_ips/ips_opp_path_rec.c +++ b/psm3/ptl_ips/ips_opp_path_rec.c @@ -66,9 +66,6 @@ static psm2_error_t ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto, __be16 slid, __be16 dlid, -#ifdef PSM_OPA - uint16_t desthfi_type, -#endif ips_path_rec_t **ppath_rec) { psm2_error_t err = PSM2_OK; @@ -144,7 +141,6 @@ ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto, proto->epinfo.ep_mtu); path_rec->pr_pkey = ntohs(opp_response.pkey); path_rec->pr_sl = ntohs(opp_response.qos_class_sl); -#ifndef PSM_OPA path_rec->pr_static_rate = opp_response.rate & 0x3f; /* this function is N/A to RoCE. 
* We don't support routing for IB/OPA so set gid to 0 @@ -152,10 +148,6 @@ ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto, */ path_rec->pr_gid_hi = 0; path_rec->pr_gid_lo = 0; -#else - path_rec->pr_static_ipd = - proto->ips_ipd_delay[opp_response.rate & 0x3f]; -#endif if (path_rec->pr_sl > PSMI_SL_MAX) { err = PSM2_INTERNAL_ERR; @@ -194,12 +186,7 @@ ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto, path_rec->pr_mtu); _HFI_CONNDBG("PKEY: 0x%04x\n", ntohs(opp_response.pkey)); _HFI_CONNDBG("SL: 0x%04x\n", ntohs(opp_response.qos_class_sl)); -#ifdef PSM_OPA - _HFI_CONNDBG("Rate: %x, IPD: %x\n", (opp_response.rate & 0x3f), - path_rec->pr_static_ipd); -#else _HFI_CONNDBG("Rate: %x\n", (opp_response.rate & 0x3f)); -#endif } _HFI_CONNDBG("Timeout Init.: 0x%" PRIx64 " Max: 0x%" PRIx64 "\n", proto->epinfo.ep_timeout_ack, @@ -220,11 +207,7 @@ ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto, static psm2_error_t ips_opp_path_rec(struct ips_proto *proto, __be16 slid, __be16 dlid, -#ifndef PSM_OPA __be64 gid_hi, __be64 gid_lo,// unused here, but must match API signature -#else - uint16_t desthfi_type, -#endif unsigned long timeout, ips_path_grp_t **ppathgrp) { psm2_error_t err = PSM2_OK; @@ -354,9 +337,6 @@ ips_opp_path_rec(struct ips_proto *proto, err = ips_opp_get_path_rec(IPS_PATH_HIGH_PRIORITY, proto, path_slid, path_dlid, -#ifdef PSM_OPA - desthfi_type, -#endif &path); if (err == PSM2_OK) { /* Valid high priority path found */ @@ -386,20 +366,6 @@ ips_opp_path_rec(struct ips_proto *proto, goto fail; } -#ifdef PSM_OPA - /* Once we have the high-priority path, set the partition key */ - if (psmi_hal_set_pkey(proto->ep->context.psm_hw_ctxt, - (uint16_t) pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY]->pr_pkey) - != 0) { - err = psm3_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, - "Couldn't set device pkey 0x%x for %s port %u: %s", - (int)pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY]->pr_pkey, - proto->ep->dev_name, proto->ep->portnum, strerror(errno)); - psmi_free(elid.key); - psmi_free(pathgrp); - goto fail; - } -#endif /* Next setup the bulk paths. 
If the subnet administrator has misconfigured @@ -415,9 +381,6 @@ ips_opp_path_rec(struct ips_proto *proto, retry_normal_path_res: err = ips_opp_get_path_rec(path_type, proto, path_slid, path_dlid, -#ifdef PSM_OPA - desthfi_type, -#endif &path); if (err != PSM2_OK) { if (path_type == IPS_PATH_NORMAL_PRIORITY) { @@ -461,9 +424,6 @@ ips_opp_path_rec(struct ips_proto *proto, retry_low_path_res: err = ips_opp_get_path_rec(path_type, proto, path_slid, path_dlid, -#ifdef PSM_OPA - desthfi_type, -#endif &path); if (err != PSM2_OK) { if (path_type == IPS_PATH_LOW_PRIORITY) { diff --git a/psm3/ptl_ips/ips_path_rec.c b/psm3/ptl_ips/ips_path_rec.c index 0dd29fe..5f26da6 100644 --- a/psm3/ptl_ips/ips_path_rec.c +++ b/psm3/ptl_ips/ips_path_rec.c @@ -76,25 +76,6 @@ #define DEF_LIMITS_STRING "4294967295:4294967295" #define DEF_LIMITS_VALUE 4294967295 -#ifdef PSM_OPA -static enum psm3_ibv_rate ips_default_hfi_rate(uint16_t hfi_type) -{ - enum psm3_ibv_rate rate; - - switch (hfi_type) { - case PSMI_HFI_TYPE_OPA1: - rate = PSM3_IBV_RATE_100_GBPS; - break; - case PSMI_HFI_TYPE_OPA2: - rate = PSM3_IBV_RATE_120_GBPS; - break; - default: - rate = PSM3_IBV_RATE_MAX; - } - - return rate; -} -#endif // PSM_OPA // unfortunately ibv_rate_to_mult and mult_to_ibv_rate have a bug as they // omit 100g rate and some others, so we create our own @@ -262,11 +243,7 @@ uint8_t psm3_timeout_usec_to_mult(uint64_t timeout_us) static psm2_error_t ips_none_get_path_rec(struct ips_proto *proto, __be16 slid, __be16 dlid, -#ifndef PSM_OPA __be64 gid_hi, __be64 gid_lo, -#else - uint16_t desthfi_type, -#endif unsigned long timeout, ips_path_rec_t **ppath_rec) { psm2_error_t err = PSM2_OK; @@ -280,12 +257,8 @@ ips_none_get_path_rec(struct ips_proto *proto, * endian CPU, this will put low bits earlier in string and cause * quicker discovery of differences when doing strcmp to sort/search */ -#ifndef PSM_OPA // TBD - slid same until have dispersive LMC-like, could just use dest snprintf(eplid, sizeof(eplid), "%x_%"PRIx64"_%"PRIx64"_%x", slid, (uint64_t)gid_lo, (uint64_t)gid_hi, dlid); -#else - snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid); -#endif elid.key = eplid; hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash); @@ -306,19 +279,9 @@ ips_none_get_path_rec(struct ips_proto *proto, path_rec->pr_mtu = proto->epinfo.ep_mtu; path_rec->pr_pkey = proto->epinfo.ep_pkey; path_rec->pr_sl = proto->epinfo.ep_sl; -#ifndef PSM_OPA path_rec->pr_gid_hi = gid_hi; /* __be64 */ path_rec->pr_gid_lo = gid_lo; /* __be64 */ path_rec->pr_static_rate = proto->epinfo.ep_link_rate; -#else - /* Determine the IPD based on our local link rate and default link rate for - * remote hfi type. 
- */ - path_rec->pr_static_ipd = - proto->ips_ipd_delay[ips_default_hfi_rate(desthfi_type)]; - - _HFI_CCADBG("pr_static_ipd = %d\n", (int) path_rec->pr_static_ipd); -#endif if (path_rec->pr_sl > PSMI_SL_MAX) { err = PSM2_INTERNAL_ERR; @@ -351,11 +314,7 @@ ips_none_get_path_rec(struct ips_proto *proto, static psm2_error_t ips_none_path_rec(struct ips_proto *proto, __be16 slid, __be16 dlid, -#ifndef PSM_OPA __be64 gid_hi, __be64 gid_lo, -#else - uint16_t desthfi_type, -#endif unsigned long timeout, ips_path_grp_t **ppathgrp) { psm2_error_t err = PSM2_OK; @@ -380,12 +339,8 @@ ips_none_path_rec(struct ips_proto *proto, * endian CPU, this will put low bits earlier in string and cause * quicker discovery of differences when doing strcmp to sort/search */ -#ifndef PSM_OPA // TBD - slid same until have dispersive LMC-like, could just use dest snprintf(eplid, sizeof(eplid), "%x_%"PRIx64"_%"PRIx64"_%x", slid, (uint64_t)gid_lo, (uint64_t)gid_hi, dlid); -#else - snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid); -#endif elid.key = eplid; hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash); @@ -443,11 +398,7 @@ ips_none_path_rec(struct ips_proto *proto, err = ips_none_get_path_rec(proto, path_slid, path_dlid, -#ifndef PSM_OPA gid_hi, gid_lo, -#else - desthfi_type, -#endif timeout, &path); if (err != PSM2_OK) { psmi_free(elid.key); @@ -474,19 +425,12 @@ ips_none_path_rec(struct ips_proto *proto, pathgrp->pg_path[0][IPS_PATH_NORMAL_PRIORITY] = path; pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY] = path; } -#ifndef PSM_OPA PSM2_LOG_MSG("path %p slid %hu dlid %hu gid %0x"PRIx64":%"PRIx64"\n", path, __be16_to_cpu(path->pr_slid), __be16_to_cpu(path->pr_dlid), __be64_to_cpu(path->pr_gid_hi), __be64_to_cpu(path->pr_gid_lo)); -#else - PSM2_LOG_MSG("path %p slid %hu dlid %hu %hu\n", - path, - __be16_to_cpu(path->pr_slid), - __be16_to_cpu(path->pr_dlid)); -#endif } @@ -583,16 +527,6 @@ static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) proto->ibta.get_path_rec = ips_none_path_rec; proto->ibta.fini = NULL; -#ifdef PSM_OPA - /* With no path records queries set pkey manually */ - if (psmi_hal_set_pkey(proto->ep->context.psm_hw_ctxt, - (uint16_t) proto->ep->network_pkey) != 0) { - err = psm3_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, - "Couldn't set device pkey 0x%x for %s port %u: %s", - (int)proto->ep->network_pkey, - proto->ep->dev_name, proto->ep->portnum, strerror(errno)); - } -#endif return err; } diff --git a/psm3/ptl_ips/ips_path_rec.h b/psm3/ptl_ips/ips_path_rec.h index c326fa2..ebca755 100644 --- a/psm3/ptl_ips/ips_path_rec.h +++ b/psm3/ptl_ips/ips_path_rec.h @@ -229,35 +229,13 @@ typedef struct ips_path_rec { __be16 pr_dlid; uint16_t pr_pkey; uint8_t pr_sl; -#ifdef PSM_OPA - uint8_t pr_pad[3]; // for alignment - uint16_t pr_static_ipd; /* Static rate IPD from path record */ -#else uint8_t pr_static_rate; // psm3_ibv_rate enum __be64 pr_gid_hi; // for ethernet, has IPv4 or IPv6 __be64 pr_gid_lo; // addr in IPv6 style -#endif uint32_t pr_mtu; /* PSM payload in bytes, <= Path's MTU */ // TBD - could reduce to 2 bytes by storing // as number of dwords instead of bytes union { -#ifdef PSM_OPA - struct { - // 64b aligned at start of struct - /* IBTA CCA parameters per path */ - /* CCA divisor [14:15] in CCT entry */ - uint8_t pr_cca_divisor; - uint8_t pr_pad[3]; - /* The current active IPD. max(static,cct) */ - uint16_t pr_active_ipd; - /* CCA table index */ - uint16_t pr_ccti; - /* Congestion timer for epr_ccti increment. 
*/ - psmi_timer *pr_timer_cca; - /* for global info */ - struct ips_proto *pr_proto; - } PACK_SUFFIX opa; -#endif /* PSM_OPA */ #ifdef PSM_VERBS // each path_rec is shared for all remote processes on a // a given node. So this is a convenient place to have diff --git a/psm3/ptl_ips/ips_proto.c b/psm3/ptl_ips/ips_proto.c index e5cec06..0731b5a 100644 --- a/psm3/ptl_ips/ips_proto.c +++ b/psm3/ptl_ips/ips_proto.c @@ -75,11 +75,7 @@ #define CTRL_MSG_NAK_QUEUED 0x0002 #define CTRL_MSG_BECN_QUEUED 0x0004 #define CTRL_MSG_ERR_CHK_QUEUED 0x0008 -#ifdef PSM_OPA -#define CTRL_MSG_ERR_CHK_GEN_QUEUED 0x0010 -#else // reserved 0x0010 -#endif #define CTRL_MSG_CONNECT_REQUEST_QUEUED 0x0020 #define CTRL_MSG_CONNECT_REPLY_QUEUED 0x0040 #define CTRL_MSG_DISCONNECT_REQUEST_QUEUED 0x0080 @@ -94,8 +90,6 @@ static void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto); #ifdef PSM_HAVE_REG_MR static psm2_error_t proto_sdma_init(struct ips_proto *proto); -#elif defined(PSM_OPA) -static psm2_error_t proto_sdma_init(struct ips_proto *proto); #endif static psm2_error_t ips_proto_register_stats(struct ips_proto *proto); @@ -221,15 +215,6 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, proto->flags |= IPS_PROTO_FLAG_LOOPBACK; } -#ifdef PSM_OPA - /* for SELINUX, psm3_ips_ibta_init will set the driver pkey which - * causes hfi1 driver to recompute the jkey, so - * we need to refetch it here - */ - /* Update JKey if necessary */ - if (getenv("PSM3_SELINUX")) - proto->epinfo.ep_jkey = psmi_hal_get_jkey(ep); -#endif { /* Disable coalesced ACKs? */ @@ -245,32 +230,13 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, /* * Initialize SDMA, otherwise, turn on all PIO. */ -#ifdef PSM_OPA - if (psmi_hal_has_cap(PSM_HAL_CAP_SDMA)) { - if ((err = proto_sdma_init(proto))) - goto fail; - } else { - proto->flags |= IPS_PROTO_FLAG_SPIO; - proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = - ~0U; - } -#else // initialize sdma after PSM3_MR_CACHE_MODE proto->flags |= IPS_PROTO_FLAG_SPIO; -#endif /* * Setup the protocol wide short message ep flow. */ -#ifdef PSM_OPA - if (proto->flags & IPS_PROTO_FLAG_SDMA) { - proto->msgflowid = EP_FLOW_GO_BACK_N_DMA; - } else { - proto->msgflowid = EP_FLOW_GO_BACK_N_PIO; - } -#else proto->msgflowid = EP_FLOW_GO_BACK_N_PIO; -#endif /* * Clone sendreq mpool configuration for pend sends config @@ -291,26 +257,6 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, } } -#ifdef PSM_OPA - /* - * Create a pool of CCA timers for path_rec. The timers should not - * exceed the scb number num_of_send_desc(default 4K). - */ - { - uint32_t chunks, maxsz; - - chunks = 256; - maxsz = num_of_send_desc; - - proto->timer_pool = - psm3_mpool_create(sizeof(struct psmi_timer), chunks, maxsz, - 0, DESCRIPTORS, NULL, NULL); - if (proto->timer_pool == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - } -#endif /* * Register ips protocol statistics @@ -371,17 +317,11 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, // protoexp implements RDMA for UD and TID for STL100 native. 
N/A to UDP // when proto->protoexp is NULL, we will not attempt to use TID nor RDMA -#ifdef PSM_OPA - if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { - proto->scbc_rv = NULL; - } else { -#else { (void)protoexp_flags; // for UD, even when RDMA is enabled, we may fall back to LONG_DATA // in which case we want the scbc_rv scb's so we don't exhaust the // scbc_egr pool -#endif proto->scbc_rv = (struct ips_scbctrl *) psmi_calloc(proto->ep, DESCRIPTORS, 1, sizeof(struct ips_scbctrl)); @@ -415,41 +355,6 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, proto->protoexp = NULL; } -#ifdef PSM_OPA -// TBD - put in HAL specific protoexp_init routine - // only used for STL100 native mode - /* - * Parse the tid error settings from the environment. - * : - */ - { - int tvals[2]; - char *tid_err; - union psmi_envvar_val env_tiderr; - - tid_err = "-1:0"; /* no tiderr warnings, never exits */ - tvals[0] = -1; - tvals[1] = 0; - - if (!psm3_getenv("PSM3_TID_ERROR", - "Tid error control ", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)tid_err, &env_tiderr)) { - /* not using default values */ - tid_err = env_tiderr.e_str; - psm3_parse_str_tuples(tid_err, 2, tvals); - } - if (tvals[0] >= 0) - proto->tiderr_warn_interval = sec_2_cycles(tvals[0]); - else - proto->tiderr_warn_interval = UINT64_MAX; - proto->tiderr_max = tvals[1]; - _HFI_PRDBG("Tid error control: warning every %d secs%s, " - "fatal error after %d tid errors%s\n", - tvals[0], (tvals[0] < 0) ? " (no warnings)" : "", - tvals[1], (tvals[1] == 0) ? " (never fatal)" : ""); - } -#endif /* Active Message interface. AM requests compete with MQ for eager * buffers, since request establish the amount of buffering in the @@ -471,9 +376,6 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, is_gpudirect_enabled = psmi_parse_gpudirect(); gpudirect_rdma_send_limit = psmi_parse_gpudirect_rdma_send_limit(0); gpudirect_rdma_recv_limit = psmi_parse_gpudirect_rdma_recv_limit(0); -#ifdef PSM_OPA - // driver capability affects driver API, so always check capability -#endif if (psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT)) is_driver_gpudirect_enabled = 1; @@ -489,20 +391,10 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, is_gpudirect_enabled = 0; gpudirect_rdma_send_limit = gpudirect_rdma_recv_limit = 0; } else if ( -#ifdef PSM_OPA // for OPA need SDMA and TID RDMA - /* All pio, No SDMA*/ - (proto->flags & IPS_PROTO_FLAG_SPIO) || - !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) || -#else // for UD and UDP, allow any RDMA mode, no SDMA (always PIO) -#endif PSMI_IS_DRIVER_GPUDIRECT_DISABLED) { err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, -#ifdef PSM_OPA - "Unable to start run, Requires SDMA, TID recv and hfi1 driver with GPU-Direct feature enabled.\n"); -#else "Unable to start run, PSM3_GPUDIRECT requires rv module with CUDA support.\n"); -#endif } else if (!(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { // only GDR Copy and GPU Send DMA allowed gpudirect_rdma_send_limit = gpudirect_rdma_recv_limit = 0; @@ -580,11 +472,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, union psmi_envvar_val env_prefetch_limit; psm3_getenv("PSM3_CUDA_PREFETCH_LIMIT", -#ifdef PSM_OPA - "How many TID windows to prefetch at RTS time(default is 2)", -#else "How many RDMA windows to prefetch at RTS time(default is 2)", -#endif PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)CUDA_WINDOW_PREFETCH_DEFAULT, &env_prefetch_limit); @@ -961,60 +849,13 @@ psm3_ips_proto_fini(struct ips_proto *proto, int force, 
uint64_t timeout_in) psm3_mpool_destroy(proto->pend_sends_pool); -#ifdef PSM_OPA - psm3_mpool_destroy(proto->timer_pool); - psmi_free(proto->sdma_scb_queue); -#endif fail: proto->t_fini = proto->t_init = 0; return err; } -#ifdef PSM_OPA -static -psm2_error_t -proto_sdma_init(struct ips_proto *proto) -{ - union psmi_envvar_val env_sdma, env_hfiegr; - psm2_error_t err = PSM2_OK; - - /* - * Only initialize if RUNTIME_SDMA is enabled. - */ - psmi_assert_always(psmi_hal_has_cap(PSM_HAL_CAP_SDMA)); - - psm3_getenv("PSM3_SDMA", - "hfi send dma flags (0 disables send dma, 2 disables send pio, " - "1 for both sdma/spio, default 1)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, - (union psmi_envvar_val)1, &env_sdma); - if (env_sdma.e_uint == 0) - proto->flags |= IPS_PROTO_FLAG_SPIO; - else if (env_sdma.e_uint == 2) - proto->flags |= IPS_PROTO_FLAG_SDMA; - - if (!(proto->flags & (IPS_PROTO_FLAG_SDMA | IPS_PROTO_FLAG_SPIO))) { - /* use both spio and sdma */ - if (!psm3_getenv("PSM3_MQ_EAGER_SDMA_THRESH", - "hfi pio-to-sdma eager switchover threshold", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) proto->iovec_thresh_eager, - &env_hfiegr)) { - proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = - env_hfiegr.e_uint; - } - } else if (proto->flags & IPS_PROTO_FLAG_SDMA) { /* all sdma */ - proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = - 0; - } else if (proto->flags & IPS_PROTO_FLAG_SPIO) { /* all spio */ - proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = - ~0U; - } - - return err; -} -#elif defined(PSM_HAVE_REG_MR) +#if defined(PSM_HAVE_REG_MR) static psm2_error_t proto_sdma_init(struct ips_proto *proto) @@ -1083,10 +924,6 @@ void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto) proto->message_type_to_mask[OPCODE_NAK] = CTRL_MSG_NAK_QUEUED; proto->message_type_to_mask[OPCODE_BECN] = CTRL_MSG_BECN_QUEUED; proto->message_type_to_mask[OPCODE_ERR_CHK] = CTRL_MSG_ERR_CHK_QUEUED; -#ifdef PSM_OPA - proto->message_type_to_mask[OPCODE_ERR_CHK_GEN] = - CTRL_MSG_ERR_CHK_GEN_QUEUED; -#endif proto->message_type_to_mask[OPCODE_CONNECT_REQUEST] = CTRL_MSG_CONNECT_REQUEST_QUEUED; proto->message_type_to_mask[OPCODE_CONNECT_REPLY] = @@ -1150,10 +987,6 @@ static __inline__ void _build_ctrl_message(struct ips_proto *proto, p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH | ((ctrl_path->pr_sl & HFI_LRH_SL_MASK) << HFI_LRH_SL_SHIFT) -#ifdef PSM_OPA - | ((proto->sl2sc[ctrl_path->pr_sl] & - HFI_LRH_SC_MASK) << HFI_LRH_SC_SHIFT) -#endif ); p_hdr->lrh[1] = dlid; p_hdr->lrh[2] = ips_proto_bytes_to_lrh2_be(proto, @@ -1164,30 +997,8 @@ static __inline__ void _build_ctrl_message(struct ips_proto *proto, p_hdr->bth[0] = __cpu_to_be32(ctrl_path->pr_pkey | (message_type << HFI_BTH_OPCODE_SHIFT)); -#ifdef PSM_OPA - /* If flow is congested then generate a BECN for path. */ - if_pf(flow->flags & IPS_FLOW_FLAG_GEN_BECN) { - p_hdr->bth[1] = __cpu_to_be32(ipsaddr->opa.context | - ipsaddr->opa.subcontext << - HFI_BTH_SUBCTXT_SHIFT | flow-> - flowid << HFI_BTH_FLOWID_SHIFT - | proto->epinfo. - ep_baseqp << HFI_BTH_QP_SHIFT | - 1 << HFI_BTH_BECN_SHIFT); - flow->flags &= ~IPS_FLOW_FLAG_GEN_BECN; - } - else { - p_hdr->bth[1] = __cpu_to_be32(ipsaddr->opa.context | - ipsaddr->opa.subcontext << - HFI_BTH_SUBCTXT_SHIFT | flow-> - flowid << HFI_BTH_FLOWID_SHIFT - | proto->epinfo. 
- ep_baseqp << HFI_BTH_QP_SHIFT); - } -#else p_hdr->bth[1] = __cpu_to_be32(flow->flowid << HFI_BTH_FLOWID_SHIFT); flow->flags &= ~IPS_FLOW_FLAG_GEN_BECN; -#endif // PSM_OPA /* p_hdr->bth[2] already set by caller, or don't care */ /* p_hdr->ack_seq_num already set by caller, or don't care */ @@ -1198,11 +1009,7 @@ static __inline__ void _build_ctrl_message(struct ips_proto *proto, p_hdr->khdr.kdeth0 = __cpu_to_le32( (ctrlscb->scb_flags & IPS_SEND_FLAG_INTR) | (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT)); -#ifndef PSM_OPA p_hdr->khdr.kdeth1 = 0; -#else - p_hdr->khdr.kdeth1 = __cpu_to_le32(proto->epinfo.ep_jkey); -#endif return; } @@ -1257,14 +1064,7 @@ psm3_ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_exp } else { psmi_assert(err == PSM2_EP_NO_RESOURCES); -#ifdef PSM_OPA - if (proto->flags & IPS_PROTO_FLAG_SDMA) - proto->stats.sdma_busy_cnt++; - else - proto->stats.pio_busy_cnt++; -#else proto->stats.pio_busy_cnt++; -#endif /* re-request a timer expiration */ psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, PSMI_TIMER_PRIO_0); @@ -1374,14 +1174,7 @@ psm3_ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type, if (err != PSM2_EP_NO_RESOURCES) return err; -#ifdef PSM_OPA - if (proto->flags & IPS_PROTO_FLAG_SDMA) - proto->stats.sdma_busy_cnt++; - else - proto->stats.pio_busy_cnt++; -#else proto->stats.pio_busy_cnt++; -#endif /* to limit the performance penalty when transfer_frame is out * of resources, we can queue a modest number of zero payload @@ -1448,9 +1241,6 @@ void MOCKABLE(psm3_ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb ips_scb_prepare_flow_inner(proto, ipsaddr, flow, scb); if ((proto->flags & IPS_PROTO_FLAG_CKSUM) && -#ifdef PSM_OPA - (scb->tidctrl == 0) && -#endif (scb->nfrag == 1)) { scb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM; ips_do_cksum(proto, &scb->ips_lrh, @@ -1514,9 +1304,6 @@ psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) if_pf(flow->credits <= 0 #ifdef PSM_BYTE_FLOW_CREDITS || flow->credit_bytes <= 0 -#endif -#ifdef PSM_OPA - || (flow->flags & IPS_FLOW_FLAG_CONGESTED) #endif ) { if (nflushed) @@ -1680,197 +1467,6 @@ psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) return err; } -#ifdef PSM_OPA -/* - * Flush all packets queued up on a flow via send DMA. - * - * Recoverable errors: - * PSM2_OK: Able to flush entire pending queue for DMA. - * PSM2_OK_NO_PROGRESS: Flushed at least 1 but not all pending packets for DMA. - * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets - * or writev returned a recoverable error (no mem for - * descriptors, dma interrupted or no space left in dma - * queue). - * - * Unrecoverable errors: - * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, - * rxe/txe parity error. - * PSM2_EP_NO_NETWORK: No network, no lid, ... 
- */ -psm2_error_t -ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed) -{ - struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; - struct ips_scb_pendlist *scb_pend = &flow->scb_pend; - ips_scb_t *scb = NULL; - psm2_error_t err = PSM2_OK; - int nsent = 0; - - psmi_assert(!SLIST_EMPTY(scb_pend)); - - /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */ - if_pf(flow->credits <= 0 -#ifdef PSM_BYTE_FLOW_CREDITS - || flow->credit_bytes <= 0 -#endif -#ifdef PSM_OPA - || (flow->flags & IPS_FLOW_FLAG_CONGESTED) -#endif - ) { - if (nflushed) - *nflushed = 0; - return PSM2_EP_NO_RESOURCES; - } - - // scb will descrbe header needed, which may be TID - err = psmi_hal_dma_send_pending_scbs(proto, flow, scb_pend, &nsent); - if (err != PSM2_OK && err != PSM2_EP_NO_RESOURCES && - err != PSM2_OK_NO_PROGRESS) - goto fail; - - if (nsent > 0) { - uint64_t t_cyc = get_cycles(); - int i = 0; - /* - * inflight counter proto->iovec_cntr_next_inflight should not drift - * from completion counter proto->iovec_cntr_last_completed away too - * far because we only have very small scb counter compared with - * uint32_t counter value. - */ -#ifdef PSM_DEBUG - flow->scb_num_pending -= nsent; -#endif - SLIST_FOREACH(scb, scb_pend, next) { - if (++i > nsent) - break; - - PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&scb->ips_lrh,"PKT_STRM: (dma)"); - - scb->scb_flags &= ~IPS_SEND_FLAG_PENDING; - scb->ack_timeout = - scb->nfrag * proto->epinfo.ep_timeout_ack; - scb->abs_timeout = - scb->nfrag * proto->epinfo.ep_timeout_ack + t_cyc; - - psmi_assert(proto->sdma_scb_queue - [proto->sdma_fill_index] == NULL); - proto->sdma_scb_queue[proto->sdma_fill_index] = scb; - scb->sdma_outstanding++; - - proto->sdma_avail_counter--; - proto->sdma_fill_index++; - if (proto->sdma_fill_index == proto->sdma_queue_size) - proto->sdma_fill_index = 0; - - /* Flow credits can temporarily go to negative for - * packets tracking purpose, because we have sdma - * chunk processing which can't send exact number - * of packets as the number of credits. - */ - flow->credits -= scb->nfrag; -#ifdef PSM_BYTE_FLOW_CREDITS - flow->credit_bytes -= scb->chunk_size; - _HFI_VDBG("after DMA send: credits %d bytes %d sent %u bytes %u\n", - flow->credits, flow->credit_bytes, - scb->nfrag, scb->chunk_size); -#else - _HFI_VDBG("after DMA send: credits %d sent %u bytes %u\n", - flow->credits, - scb->nfrag, scb->chunk_size); -#endif - } - SLIST_FIRST(scb_pend) = scb; - } - - if (SLIST_FIRST(scb_pend) != NULL) { - psmi_assert(flow->scb_num_pending > 0); - - switch (flow->protocol) { - case PSM_PROTOCOL_TIDFLOW: -#ifndef PSM_OPA - // for UD we use RC QP instead of STL100's TIDFLOW HW - // UDP has no RDMA - psmi_assert_always(0); // we don't allocate ips_flow for TID -#else - // some tidflow specific cleanup - /* For Tidflow we can cancel the ack timer if we have flow credits - * available and schedule the send timer. If we are out of flow - * credits then the ack timer is scheduled as we are waiting for - * an ACK to reclaim credits. This is required since multiple - * tidflows may be active concurrently. - */ - if (flow->credits > 0 -#ifdef PSM_BYTE_FLOW_CREDITS - && flow->credit_bytes > 0 -#endif - ) { - /* Cancel ack timer and reschedule send timer. Increment - * sdma_busy_cnt as this really is DMA buffer exhaustion. 
- */ - psmi_timer_cancel(proto->timerq, - flow->timer_ack); - psmi_timer_request(proto->timerq, - flow->timer_send, - get_cycles() + - (proto->timeout_send << 1)); - proto->stats.sdma_busy_cnt++; - } else { - /* Re-instate ACK timer to reap flow credits */ - psmi_timer_request(proto->timerq, - flow->timer_ack, - get_cycles() + - (proto->epinfo. - ep_timeout_ack >> 2)); - } -#endif // ! PSM_OPA - - break; - case PSM_PROTOCOL_GO_BACK_N: - default: - if (flow->credits > 0 -#ifdef PSM_BYTE_FLOW_CREDITS - && flow->credit_bytes > 0 -#endif - ) { - /* Schedule send timer and increment sdma_busy_cnt */ - psmi_timer_request(proto->timerq, - flow->timer_send, - get_cycles() + - (proto->timeout_send << 1)); - proto->stats.sdma_busy_cnt++; - } else { - /* Schedule ACK timer to reap flow credits */ - psmi_timer_request(proto->timerq, - flow->timer_ack, - get_cycles() + - (proto->epinfo. - ep_timeout_ack >> 2)); - } - break; - } - } else { - /* Schedule ack timer */ - psmi_timer_cancel(proto->timerq, flow->timer_send); - psmi_timer_request(proto->timerq, flow->timer_ack, - get_cycles() + proto->epinfo.ep_timeout_ack); - } - - /* We overwrite error with its new meaning for flushing packets */ - if (nsent > 0) - if (scb) - err = PSM2_OK_NO_PROGRESS; /* partial flush */ - else - err = PSM2_OK; /* complete flush */ - else - err = PSM2_EP_NO_RESOURCES; /* no flush at all */ - -fail: - if (nflushed) - *nflushed = nsent; - - return err; -} -#endif // PSM_OPA #ifdef PSM_HAVE_SDMA /* @@ -1984,21 +1580,10 @@ psm3_ips_proto_timer_ack_callback(struct psmi_timer *current_timer, SLIST_FIRST(&flow->scb_pend)->seq_num; if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { -#ifndef PSM_OPA // for UD we use RC QP instead of STL100's TIDFLOW HW // UDP has no RDMA psmi_assert_always(0); // we don't allocate ips_flow for TID message_type = OPCODE_ERR_CHK; // keep KlockWorks happy -#else - message_type = OPCODE_ERR_CHK_GEN; - err_chk_seq.psn_seq -= 1; - /* Receive descriptor index */ - ctrlscb.ips_lrh.data[0].u64 = - scb->tidsendc->rdescid.u64; - /* Send descriptor index */ - ctrlscb.ips_lrh.data[1].u64 = - scb->tidsendc->sdescid.u64; -#endif } else { PSM2_LOG_MSG("sending ERR_CHK message"); message_type = OPCODE_ERR_CHK; @@ -2027,24 +1612,6 @@ psm3_ips_proto_timer_send_callback(struct psmi_timer *current_timer, uint64_t current) { struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow; -#ifdef PSM_OPA - struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; - - /* If flow is marked as congested adjust injection rate - see process nak - * when a congestion NAK is received. - */ - if_pf(flow->flags & IPS_FLOW_FLAG_CONGESTED) { - - /* Clear congestion flag and decrease injection rate */ - flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; - if ((flow->path->opa.pr_ccti + - proto->cace[flow->path->pr_sl].ccti_increase) <= - proto->ccti_limit) - ips_cca_adjust_rate(flow->path, - proto->cace[flow->path->pr_sl]. - ccti_increase); - } -#endif if (!SLIST_EMPTY(&flow->scb_pend)) flow->flush(flow, NULL); @@ -2052,83 +1619,6 @@ psm3_ips_proto_timer_send_callback(struct psmi_timer *current_timer, return PSM2_OK; } -#ifdef PSM_OPA -psm2_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment) -{ - struct ips_proto *proto = path_rec->opa.pr_proto; - - /* Increment/decrement ccti for path */ - psmi_assert_always(path_rec->opa.pr_ccti >= - proto->cace[path_rec->pr_sl].ccti_min); - path_rec->opa.pr_ccti += cct_increment; - - /* Determine new active IPD. 
*/ -#if _HFI_DEBUGGING - uint16_t prev_ipd = 0; - uint16_t prev_divisor = 0; - if (_HFI_CCADBG_ON) { - prev_ipd = path_rec->opa.pr_active_ipd; - prev_divisor = path_rec->opa.pr_cca_divisor; - } -#endif - if ((path_rec->pr_static_ipd) && - ((path_rec->pr_static_ipd + 1) > - (proto->cct[path_rec->opa.pr_ccti] & CCA_IPD_MASK))) { - path_rec->opa.pr_active_ipd = path_rec->pr_static_ipd + 1; - path_rec->opa.pr_cca_divisor = 0; - } else { - path_rec->opa.pr_active_ipd = - proto->cct[path_rec->opa.pr_ccti] & CCA_IPD_MASK; - path_rec->opa.pr_cca_divisor = - proto->cct[path_rec->opa.pr_ccti] >> CCA_DIVISOR_SHIFT; - } - -#if _HFI_DEBUGGING - if (_HFI_CCADBG_ON) { - _HFI_CCADBG_ALWAYS("CCA: %s injection rate to <%x.%x> from <%x.%x>\n", - (cct_increment > 0) ? "Decreasing" : "Increasing", - path_rec->opa.pr_cca_divisor, path_rec->opa.pr_active_ipd, - prev_divisor, prev_ipd); - } -#endif - - /* Reschedule CCA timer if this path is still marked as congested */ - if (path_rec->opa.pr_ccti > proto->cace[path_rec->pr_sl].ccti_min) { - if (path_rec->opa.pr_timer_cca == NULL) { - path_rec->opa.pr_timer_cca = - (struct psmi_timer *)psm3_mpool_get(proto-> - timer_pool); - psmi_assert(path_rec->opa.pr_timer_cca != NULL); - psmi_timer_entry_init(path_rec->opa.pr_timer_cca, - ips_cca_timer_callback, path_rec); - } - psmi_timer_request(proto->timerq, - path_rec->opa.pr_timer_cca, - get_cycles() + - proto->cace[path_rec->pr_sl]. - ccti_timer_cycles); - } else if (path_rec->opa.pr_timer_cca) { - psm3_mpool_put(path_rec->opa.pr_timer_cca); - path_rec->opa.pr_timer_cca = NULL; - } - - return PSM2_OK; -} - -psm2_error_t -ips_cca_timer_callback(struct psmi_timer *current_timer, uint64_t current) -{ - ips_path_rec_t *path_rec = (ips_path_rec_t *) current_timer->context; - - /* Increase injection rate for flow. Decrement CCTI */ - if (path_rec->opa.pr_ccti > path_rec->opa.pr_proto->cace[path_rec->pr_sl].ccti_min) - return ips_cca_adjust_rate(path_rec, -1); - - psm3_mpool_put(path_rec->opa.pr_timer_cca); - path_rec->opa.pr_timer_cca = NULL; - return PSM2_OK; -} -#endif // PSM_OPA #ifdef PSM_VERBS static uint64_t verbs_ep_send_num_free(void *context) @@ -2161,11 +1651,6 @@ ips_proto_register_stats(struct ips_proto *proto) * * We put a (**) in the output of those stats that "should never happen" */ -#ifdef PSM_OPA - uint64_t *pio_stall_cnt = NULL; - - psmi_hal_get_pio_stall_cnt(proto->ep->context.psm_hw_ctxt,&pio_stall_cnt); -#endif struct psmi_stats_entry entries[] = { PSMI_STATS_DECLU64("pio_busy_count", @@ -2190,17 +1675,7 @@ ips_proto_register_stats(struct ips_proto *proto) #ifdef PSM_HAVE_SDMA /* SDMA statistics only applicable to HALs with send DMA */ -#ifdef PSM_OPA - /* SDMA Throttling by kernel */ - PSMI_STATS_DECLU64("sdma_busy_cnt", - &proto->stats.sdma_busy_cnt), -#endif // When must wait for local SDMA completions. -#ifdef PSM_OPA - // wait for completion of SDMA for sync control message send - PSMI_STATS_DECLU64("sdma_compl_wait_ctrl", - &proto->stats.sdma_compl_wait_ctrl), -#endif // wait for completion of SDMA as part of ACK processing. // got an ACK for original SDMA which we did not yet complete. 
// can imply late arrival of original at remote end after we @@ -2224,16 +1699,6 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("scb_unavail_eager_count", &proto->stats.scb_egr_unavail_cnt), -#ifdef PSM_OPA - PSMI_STATS_DECLU64("scb_unavail_exp_count", - &proto->stats.scb_exp_unavail_cnt), - PSMI_STATS_DECLU64("rcvhdr_overflows", /* Normal egr/hdr ovflw */ - &proto->stats.hdr_overflow), - PSMI_STATS_DECLU64("rcveager_overflows", - &proto->stats.egr_overflow), - PSMI_STATS_DECLU64("lid_zero_errs_(**)", /* shouldn't happen */ - &proto->stats.lid_zero_errs), -#endif // PSM_OPA PSMI_STATS_DECLU64("unknown_packets_(**)", /* shouldn't happen */ &proto->stats.unknown_packets), PSMI_STATS_DECLU64("stray_packets_(*)", @@ -2244,24 +1709,6 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("partial_read_cnt", &proto->stats.partial_read_cnt), #endif -#ifdef PSM_OPA - PSMI_STATS_DECLU64("pio_stalls_(*)", /* shouldn't happen too often */ - pio_stall_cnt), - PSMI_STATS_DECLU64("ICRC_error_(*)", - &proto->error_stats.num_icrc_err), - PSMI_STATS_DECLU64("ECC_error", - &proto->error_stats.num_ecc_err), - PSMI_STATS_DECLU64("Len_error", - &proto->error_stats.num_len_err), - PSMI_STATS_DECLU64("TID_error", - &proto->error_stats.num_tid_err), - PSMI_STATS_DECLU64("DC_error", - &proto->error_stats.num_dc_err), - PSMI_STATS_DECLU64("DCUNC_error", - &proto->error_stats.num_dcunc_err), - PSMI_STATS_DECLU64("KHDRLEN_error", - &proto->error_stats.num_khdrlen_err), -#endif // PSM_OPA PSMI_STATS_DECLU64("err_chk_send", &proto->epaddr_stats.err_chk_send), PSMI_STATS_DECLU64("err_chk_recv", @@ -2317,10 +1764,6 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rdma_rexmit_(*)", &proto->epaddr_stats.rdma_rexmit), #endif -#endif -#ifdef PSM_OPA - PSMI_STATS_DECLU64("congestion_pkts", - &proto->epaddr_stats.congestion_pkts), #endif PSMI_STATS_DECLU64("tiny_cpu_isend", &proto->strat_stats.tiny_cpu_isend), diff --git a/psm3/ptl_ips/ips_proto.h b/psm3/ptl_ips/ips_proto.h index 132e0ee..c630208 100644 --- a/psm3/ptl_ips/ips_proto.h +++ b/psm3/ptl_ips/ips_proto.h @@ -66,7 +66,6 @@ #include "ips_tidflow.h" #include "ips_path_rec.h" -#ifndef PSM_OPA // when defined, this enables use of byte based flow credits in addition // to packet based. // It can help UDP to avoid overflowing the sockets kernel buffers. @@ -74,9 +73,6 @@ // memory at scale. 
// UD/RC, TCP and OPA HALs self configure so this has no effect #define PSM_BYTE_FLOW_CREDITS -#else -#undef PSM_BYTE_FLOW_CREDITS -#endif typedef enum ips_path_type { IPS_PATH_LOW_PRIORITY, @@ -93,28 +89,13 @@ typedef enum ips_path_type { */ struct ips_epinfo { __be16 ep_base_lid; -#ifdef PSM_OPA - uint8_t ep_baseqp; -#else -#endif uint8_t ep_hash; // for hashing adaptive dispersive routing uint8_t ep_lmc; enum psm3_ibv_rate ep_link_rate; -#ifdef PSM_OPA - uint16_t ep_context; - uint16_t ep_subcontext; - uint16_t ep_hfi_type; -#endif uint16_t ep_sl; /* PSM3_NIC_SL only when path record not used */ uint32_t ep_mtu; // PSM payload after potential hdr & PSM3_MTU decrease // or TCP increase beyond wire size -#ifdef PSM_OPA - uint16_t ep_piosize; -#endif uint16_t ep_pkey; /* PSM3_PKEY only when path record not used */ -#ifdef PSM_OPA - uint16_t ep_jkey; // for STL100 kdeth header -#endif uint64_t ep_timeout_ack; /* PSM3_ERRCHK_TIMEOUT if no path record */ uint64_t ep_timeout_ack_max; uint32_t ep_timeout_ack_factor; @@ -252,10 +233,6 @@ struct ips_proto_stats { uint64_t post_send_fail; #endif #ifdef PSM_HAVE_SDMA -#ifdef PSM_OPA - uint64_t sdma_busy_cnt; - uint64_t sdma_compl_wait_ctrl; -#endif uint64_t sdma_compl_wait_ack; uint64_t sdma_compl_wait_resend; uint64_t sdma_compl_slow; @@ -263,12 +240,6 @@ struct ips_proto_stats { #endif uint64_t scb_egr_unavail_cnt; -#ifdef PSM_OPA - uint64_t scb_exp_unavail_cnt; - uint64_t hdr_overflow; - uint64_t egr_overflow; - uint64_t lid_zero_errs; -#endif uint64_t unknown_packets; uint64_t stray_packets; #ifdef PSM_SOCKETS @@ -277,17 +248,6 @@ struct ips_proto_stats { #endif }; -#ifdef PSM_OPA -struct ips_proto_error_stats { - uint64_t num_icrc_err; - uint64_t num_ecc_err; - uint64_t num_len_err; - uint64_t num_tid_err; - uint64_t num_dc_err; - uint64_t num_dcunc_err; - uint64_t num_khdrlen_err; -}; -#endif /* * Updates to these stats must be reflected in ips_ptl_epaddr_stats_init @@ -325,9 +285,6 @@ struct ips_proto_epaddr_stats { uint64_t rdma_rexmit; #endif #endif -#ifdef PSM_OPA - uint64_t congestion_pkts; /* IB CCA FECN packets */ -#endif }; /* OPP support structure. 
*/ @@ -342,11 +299,7 @@ struct opp_api { struct ips_ibta_compliance_fn { psm2_error_t(*get_path_rec) (struct ips_proto *proto, __be16 slid, __be16 dlid, -#ifndef PSM_OPA __be64 gid_hi, __be64 gid_lo, -#else - uint16_t desthfi_type, -#endif unsigned long timeout, ips_path_grp_t **ppathgrp); psm2_error_t(*fini) (struct ips_proto *proto); @@ -355,18 +308,12 @@ struct ips_ibta_compliance_fn { /* please don't change the flow id order */ typedef enum ips_epaddr_flow { EP_FLOW_GO_BACK_N_PIO, -#ifdef PSM_OPA - EP_FLOW_GO_BACK_N_DMA, -#endif EP_FLOW_TIDFLOW, /* Can either pio or dma for tidflow */ EP_FLOW_LAST /* Keep this the last endpoint flow */ } ips_epaddr_flow_t; typedef enum psm_transfer_type { PSM_TRANSFER_PIO, -#ifdef PSM_OPA - PSM_TRANSFER_DMA, -#endif PSM_TRANSFER_LAST /* Keep this the last transfer type */ } psm_transfer_type_t; @@ -391,29 +338,10 @@ struct ips_proto { struct ips_scbctrl scbc_egr; struct ips_epinfo epinfo; -#ifdef PSM_OPA - // TBD move this into gen1 HALs ep or psmi_context - ips_scb_t **sdma_scb_queue; - uint16_t sdma_queue_size; - uint16_t sdma_fill_index; - uint16_t sdma_done_index; - uint16_t sdma_avail_counter; -#endif uint64_t timeout_send; -#ifdef PSM_OPA - uint32_t flags; /* < if IPS_PROTO_FLAG_SDMA is NOT set, SPIO flow will be initialized - * < if IPS_PROTO_FLAG_SPIO is NOT set, SDMA flow will be initialized - * < so both flows (SDMA and PIO) will be initialized if both of the - * < IPS_PROTO_FLAG_S{DMA,PIO} are CLEARED - */ -#else uint32_t flags; -#endif -#ifdef PSM_OPA - uint32_t iovec_thresh_eager; - uint32_t iovec_thresh_eager_blocking; -#elif defined(PSM_HAVE_REG_MR) +#if defined(PSM_HAVE_REG_MR) // TBD adjust rest of Send DMA code to use PSM_HAVE_SDMA uint32_t iovec_thresh_eager; uint32_t iovec_thresh_eager_blocking; @@ -433,14 +361,8 @@ struct ips_proto { uint32_t flow_credit_bytes; // credit limit in bytes #endif mpool_t pend_sends_pool; -#ifdef PSM_OPA - mpool_t timer_pool; -#endif struct ips_ibta_compliance_fn ibta; struct ips_proto_stats stats; -#ifdef PSM_OPA - struct ips_proto_error_stats error_stats; -#endif struct ips_proto_epaddr_stats epaddr_stats; struct ptl_strategy_stats strat_stats; @@ -455,13 +377,6 @@ struct ips_proto { psm2_mr_cache_t mr_cache; #endif -#ifdef PSM_OPA - /* Handling tid errors */ - uint32_t tiderr_cnt; - uint32_t tiderr_max; - uint64_t tiderr_tnext; - uint64_t tiderr_warn_interval; -#endif uint64_t t_init; uint64_t t_fini; @@ -488,27 +403,6 @@ struct ips_proto { uint64_t count; } psmi_logevent_tid_send_reqs; -#ifdef PSM_OPA - /* SL2SC and SC2VL table for protocol */ - uint16_t sl2sc[32]; - /* CCA per port */ - uint16_t *cct; /* cct table */ - uint16_t ccti_size; /* ccti table size */ - uint16_t ccti_limit; /* should be <= size-1 */ - - uint16_t ccti_portctrl; /* QP or SL CC */ - uint32_t ccti_ctrlmap; /* map for valid sl */ - struct cace { /* CACongestionEntry */ - uint8_t ccti_increase; /* steps to increase */ - /* uint16_t ccti_timer;*/ /* CCTI Timer in units of 1.024 usec */ - uint64_t ccti_timer_cycles; /* converted from us_2_cycles() */ - uint8_t ccti_threshold; /* threshold to make log */ - uint8_t ccti_min; /* min value for ccti */ - } cace[32]; /* 32 service levels */ - - /* Path record support */ - uint8_t ips_ipd_delay[PSM3_IBV_RATE_300_GBPS + 1]; -#endif /* * Disable the LMC based dispersive routing for all message * sizes in bytes between ips_lmc_disable_low and ips_lmc_disable_high, @@ -559,19 +453,6 @@ struct ips_proto { time_t writevFailTime; }; -#ifdef PSM_OPA -static inline int 
-ips_proto_is_disabled_pio(struct ips_proto *proto) -{ - return !!(proto->flags & IPS_PROTO_FLAG_SDMA); -} - -static inline int -ips_proto_is_disabled_sdma(struct ips_proto *proto) -{ - return !!(proto->flags & IPS_PROTO_FLAG_SPIO); -} -#endif /* * Test the payload length against the lmc_disable_low and lmc_disable_hi @@ -619,9 +500,6 @@ struct ips_flow { uint16_t protocol:3; /* go-back-n or tidflow */ uint16_t flags:8; /* flow state flags */ -#ifdef PSM_OPA - uint16_t cca_ooo_pkts; /* cca out of order packets */ -#endif // TBD - cwin only needed for OPA for CCA uint16_t cwin; /* Size of congestion window in packets */ // to allow for good pipelining of send/ACK need to trigger an ack at @@ -774,16 +652,6 @@ struct ips_epaddr { int tcp_fd; } sockets; #endif /* PSM_SOCKETS */ -#ifdef PSM_OPA - struct { - // For PSM_OPA this is computed based on - // min(negotiated mtu * TID_MAX, mq->hfi_base_window_rv) - // For PSM_VERBS/UDP this is always mq->hfi_base_window_rv - uint32_t window_rv; /* RNDV window size per conn */ - uint8_t context; /* real context */ - uint8_t subcontext; /* sub context, 3 bits, 5 bits for future */ - } opa; -#endif /* PSM_OPA */ }; /* this portion is only for connect/disconnect */ @@ -878,22 +746,10 @@ void MOCKABLE(psm3_ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb MOCK_DCL_EPILOGUE(psm3_ips_proto_flow_enqueue); psm2_error_t psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed); -#ifdef PSM_OPA -psm2_error_t ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed); -#endif /* Wrapper for enqueue + flush */ psm2_error_t ips_proto_scb_pio_send(struct ips_flow *flow, ips_scb_t *scb); -#ifdef PSM_OPA -void ips_proto_scb_dma_enqueue(struct ips_proto *proto, ips_scb_t *scb); -psm2_error_t ips_proto_scb_dma_flush(struct ips_proto *proto, - ips_epaddr_t *ipsaddr, int *nflushed); -psm2_error_t ips_dma_transfer_frame(struct ips_proto *proto, - struct ips_flow *flow, ips_scb_t *scb, - void *payload, uint32_t paylen, - uint32_t have_cksum, uint32_t cksum); -#endif #ifdef PSM_HAVE_SDMA psm2_error_t ips_proto_dma_wait_until(struct ips_proto *proto, ips_scb_t *scb); #endif @@ -963,24 +819,6 @@ MOCK_DCL_EPILOGUE(psm3_ips_ibta_init); psm2_error_t psm3_ips_ibta_fini(struct ips_proto *proto); -#ifdef PSM_OPA -PSMI_ALWAYS_INLINE( -struct psm_hal_sdma_req_info * -psm3_get_sdma_req_info(struct ips_scb *scb, size_t *extra)) -{ - *extra = 0; -#ifdef PSM_CUDA - if (PSMI_IS_DRIVER_GPUDIRECT_DISABLED) - return (struct psm_hal_sdma_req_info *)(((char *)&scb->pbc) - - (sizeof(struct psm_hal_sdma_req_info) - - PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA)); - - *extra = PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA; -#endif // PSM_CUDA - - return (struct psm_hal_sdma_req_info *)(((char *)&scb->pbc) - (sizeof(struct psm_hal_sdma_req_info))); -} -#endif // PSM_OPA #if defined(PSM_CUDA) || defined(PSM_ONEAPI) PSMI_ALWAYS_INLINE( @@ -995,21 +833,5 @@ uint32_t ips_cuda_next_window(uint32_t max_window, uint32_t offset, } #endif -#ifdef PSM_OPA -/* Determine if FECN bit is set IBTA 1.2.1 CCA Annex A*/ - -static __inline__ uint8_t -_is_cca_fecn_set(const struct ips_message_header *p_hdr) -{ - return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FECN_SHIFT) & 0x1; -} - -/* Detrmine if BECN bit is set IBTA 1.2.1 CCA Annex A*/ -static __inline__ uint8_t -_is_cca_becn_set(const struct ips_message_header *p_hdr) -{ - return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_BECN_SHIFT) & 0x1; -} -#endif #endif /* _IPS_PROTO_H */ diff --git a/psm3/ptl_ips/ips_proto_am.c b/psm3/ptl_ips/ips_proto_am.c index 
11dde28..79989c7 100644 --- a/psm3/ptl_ips/ips_proto_am.c +++ b/psm3/ptl_ips/ips_proto_am.c @@ -103,11 +103,7 @@ MOCKABLE(psm3_ips_proto_am_init)(struct ips_proto *proto, struct ips_proto_am *proto_am) { psm2_error_t err = PSM2_OK; -#ifdef PSM_OPA - int send_buf_size = psmi_hal_get_pio_size(proto->ep->context.psm_hw_ctxt); -#else int send_buf_size = proto->epinfo.ep_mtu; -#endif int num_rep_slots = calc_optimal_num_reply_slots(num_send_slots); int num_req_slots = num_send_slots - num_rep_slots; @@ -178,11 +174,7 @@ psm3_ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) { int max_nargs = min(1 << IPS_AM_HDR_NARGS_BITS, PSMI_AM_MAX_ARGS); int max_payload = -#ifdef PSM_OPA - psmi_hal_get_pio_size(ep->context.psm_hw_ctxt) - -#else ep->mtu - -#endif ((max_nargs - IPS_AM_HDR_NARGS) * sizeof(psm2_amarg_t)); if (parameters == NULL) { diff --git a/psm3/ptl_ips/ips_proto_connect.c b/psm3/ptl_ips/ips_proto_connect.c index 9ae35ba..408d583 100644 --- a/psm3/ptl_ips/ips_proto_connect.c +++ b/psm3/ptl_ips/ips_proto_connect.c @@ -161,14 +161,6 @@ ips_ipsaddr_configure_flows(struct ips_epaddr *ipsaddr, struct ips_proto *proto) ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N, IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_PIO); -#ifdef PSM_OPA - /* DMA flow uses the low priority path, multi MTU sized eager - * message uses the same flow to transfer to avoid out of order. - */ - psm3_ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA], proto, - ipsaddr, PSM_TRANSFER_DMA, PSM_PROTOCOL_GO_BACK_N, - IPS_PATH_LOW_PRIORITY, EP_FLOW_GO_BACK_N_DMA); -#endif } /* @@ -207,9 +199,6 @@ static psm2_epaddr_t ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid, const char *hostname, -#ifdef PSM_OPA - uint16_t hfi_type, -#endif unsigned long timeout, psm2_error_t *err_out); /* we check connect_verno and parse the epid @@ -241,19 +230,12 @@ static int ips_proto_connect_hdr_parse(void *payload, uint32_t paylen, psm2_epid // connect_hdr, so a failed connect due to connect_verno mismatch // can't really be replied to with an error in req->connect_result // so we just exit with a fatal error here. -#ifdef PSM_OPA - if (hdr->connect_verno < IPS_CONNECT_VERNO) - goto bad_verno; -#endif // for now we are strict about major rev, if we add additional optional // features they can be minor revs and may need more sophisticated handling if (IPS_CONNECT_VER_MAJOR(hdr->connect_verno) == IPS_CONNECT_VER_MAJOR(IPS_CONNECT_VERNO)) { *epid = psm3_epid_pack_words(hdr->epid_w[0], hdr->epid_w[1], hdr->epid_w[2]); } else { -#ifdef PSM_OPA -bad_verno: -#endif psm3_handle_error(PSMI_EP_NORETURN, PSM2_EPID_INVALID_VERSION, "Connect protocol (%x.%x) is incompatible with %x.%x", IPS_CONNECT_VER_MAJOR(hdr->connect_verno), @@ -292,22 +274,10 @@ ips_ipsaddr_set_req_params(struct ips_proto *proto, // common_mtu will be further reduced by pr_mtu to set frag_size and RC mtu uint32_t common_mtu = min(req->mtu, proto->epinfo.ep_mtu); psmi_assert_always(req->static_rate > 0); -#ifndef PSM_OPA enum psm3_ibv_rate common_rate = min_rate(req->static_rate, proto->epinfo.ep_link_rate); -#endif int ptype, pidx; -#ifdef PSM_OPA - /* - * Make RNDV window size being dependent on MTU size; - * This is due to fact that number of send packets - * within a given window must not exceed 2048 (@ref PSM_TID_MAX_PKTS). - * Use smaller of two values: - * unified MTU * PSM_TID_MAX_PKTS vs already configured window size. 
- */ - ipsaddr->opa.window_rv = min(common_mtu * PSM_TID_MAX_PKTS, proto->mq->hfi_base_window_rv); -#endif /* * For static routes i.e. "none" path resolution update all paths to @@ -323,19 +293,15 @@ ips_ipsaddr_set_req_params(struct ips_proto *proto, if (proto->ep->path_res_type == PSM2_PATH_RES_NONE) { ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu = common_mtu; -#ifndef PSM_OPA ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_static_rate = common_rate; -#endif } else { ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu = min(common_mtu, ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu); -#ifndef PSM_OPA ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_static_rate = min_rate(common_rate, ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_static_rate); -#endif } } @@ -402,17 +368,10 @@ ips_ipsaddr_set_req_params(struct ips_proto *proto, psm2_epid_t rail_epid; psmi_subnet128_t rail_subnet; -#ifdef PSM_OPA - // 3 64b word rail_addr, but only 1 word epid - rail_epid = psm3_epid_pack_word(rail_addr[0]); - rail_subnet = psmi_subnet_pack(rail_epid, rail_addr[1]); - // ignore 3rd word of rail_addr (should be 0) -#else // 3 64b word rail_addr with 3 64b word epid // epid contains subnet (IPv6 subnet prefix) rail_epid = psm3_epid_pack_words(rail_addr[0], rail_addr[1], rail_addr[2]); rail_subnet = psm3_epid_subnet(rail_epid); -#endif // match rails by address format and full subnet // and associate with matching local ep @@ -421,9 +380,6 @@ ips_ipsaddr_set_req_params(struct ips_proto *proto, epaddr = ips_alloc_epaddr(&((struct ptl_ips *)(ep->ptl_ips.ptl))->proto, 0, rail_epid, NULL, -#ifdef PSM_OPA - PSMI_HFI_TYPE_OPA1, -#endif 5000, &err); if (epaddr == NULL) return err; @@ -583,20 +539,6 @@ ips_proto_build_connect_message(struct ips_proto *proto, while (ep != proto->ep) { psmi_assert(PSMI_EPID_LEN <= IPS_CONNECT_RAIL_ADDR_LEN); -#ifdef PSM_OPA - // 3 64b word rail_addr, but only 1 word epid - // epid 1st so can parse size - *data = psm3_epid_w0(ep->epid); - paylen += sizeof(uint64_t); - data++; - *data = psm3_epid_subnet_extra_word(ep->subnet); - paylen += sizeof(uint64_t); - data++; - - *data = 0; - paylen += sizeof(uint64_t); - data++; -#else // 3 64b word rail_addr with 3 64b word epid // epid contains full subnet *data = psm3_epid_w0(ep->epid); @@ -610,7 +552,6 @@ ips_proto_build_connect_message(struct ips_proto *proto, *data = psm3_epid_w2(ep->epid); paylen += sizeof(uint64_t); data++; -#endif psmi_assert_always(paylen <= max_paylen); ep = ep->mctxt_next; } @@ -693,9 +634,6 @@ static psm2_epaddr_t ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid, const char *hostname, -#ifdef PSM_OPA - uint16_t hfi_type, -#endif unsigned long timeout, psm2_error_t *err_out) { psm2_error_t err = PSM2_OK; @@ -703,9 +641,7 @@ ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid, ips_epaddr_t *ipsaddr; ips_path_grp_t *pathgrp; uint16_t lid; -#ifndef PSM_OPA psmi_gid128_t gid; -#endif /* The PSM/PTL-level epaddr, ips-level epaddr, and per-peer msgctl * structures are collocated in memory for performance reasons -- this is @@ -768,21 +704,13 @@ ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid, /* get HAL specific addressing fields initialized in ipsaddr as well as * fetching lid and gid for our path record query */ -#ifdef PSM_OPA - psmi_hal_ips_ipsaddr_init_addressing(proto, epid, ipsaddr, &lid); -#else psmi_hal_ips_ipsaddr_init_addressing(proto, epid, ipsaddr, &lid, &gid); -#endif /* Get path record for tuple */ err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid, 
/* __be16 */ __cpu_to_be16(lid), -#ifndef PSM_OPA __cpu_to_be64(gid.hi), __cpu_to_be64(gid.lo), -#else - hfi_type, -#endif timeout, &pathgrp); if (err != PSM2_OK) { @@ -939,32 +867,21 @@ psm3_ips_proto_process_connect(struct ips_proto *proto, uint8_t opcode, if (ipsaddr == NULL) { ips_path_grp_t *pathgrp; uint16_t lid; -#ifndef PSM_OPA psmi_gid128_t gid; -#endif ipsaddr = &ipsaddr_f; memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t)); ipsaddr_f.hash = psm3_epid_context(epid); -#ifdef PSM_OPA - psmi_hal_ips_ipsaddr_init_addressing(proto, - epid, &ipsaddr_f, &lid); -#else psmi_hal_ips_ipsaddr_init_addressing(proto, epid, &ipsaddr_f, &lid, &gid); -#endif /* Get path record for peer */ err = proto->ibta.get_path_rec(proto, proto->epinfo. ep_base_lid, /* __be16 */ __cpu_to_be16(lid), -#ifndef PSM_OPA __cpu_to_be64(gid.hi), __cpu_to_be64(gid.lo), -#else - PSMI_HFI_TYPE_OPA1, -#endif 3000, &pathgrp); if (err != PSM2_OK) goto fail; @@ -1059,9 +976,6 @@ ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr, newconnect = 1; if ((epaddr = ips_alloc_epaddr(proto, 1, epid, req->hostname, -#ifdef PSM_OPA - PSMI_HFI_TYPE_OPA1, -#endif 5000, &err)) == NULL) { goto fail; } @@ -1107,7 +1021,6 @@ ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr, psm3_epid_str_addr_fmt(proto->ep->epid), proto->ep->addr_fmt); connect_result = PSM2_EPID_INVALID_CONNECT; -#ifndef PSM_OPA } else if (psm3_epid_protocol(epid) != psm3_epid_protocol(proto->ep->epid)) { // before connections started, sender should have confirmed // epid formats match for master and each rail @@ -1120,7 +1033,6 @@ ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr, psm3_epid_str_protocol(proto->ep->epid), psm3_epid_protocol(proto->ep->epid)); connect_result = PSM2_EPID_INVALID_CONNECT; -#endif /* PSM_OPA */ } else if (!(proto->flags & IPS_PROTO_FLAG_QUERY_PATH_REC) && proto->epinfo.ep_pkey != psmi_hal_get_default_pkey() && proto->epinfo.ep_pkey != req->job_pkey) { @@ -1137,11 +1049,7 @@ ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr, connect_result = PSM2_EPID_INVALID_CONNECT; _HFI_ERROR("Remote Connection error (%s %s): %s Wire Mode mismatch (local:%d, remote:%d)\n", req->hostname, psm3_epid_fmt_addr(epid, 0), -#ifndef PSM_OPA psm3_epid_str_protocol(epid), -#else - "", -#endif proto->ep->wiremode, req->wiremode); } else { connect_result = PSM2_OK; @@ -1270,9 +1178,6 @@ psm3_ips_proto_connect(struct ips_proto *proto, int numep, // so we lack it's hostname, rv and qpn info epaddr = ips_alloc_epaddr(proto, 1, array_of_epid[i], NULL, -#ifdef PSM_OPA - PSMI_HFI_TYPE_OPA1, -#endif (timeout_in / 1000000UL), &err); if (epaddr == NULL) { _HFI_ERROR("Unable to issue connect from %s to %s: %s\n", @@ -1686,12 +1591,6 @@ psm3_ips_proto_disconnect(struct ips_proto *proto, int force, int numep, !STAILQ_EMPTY(&ipsaddr->flows [EP_FLOW_GO_BACK_N_PIO]. scb_unacked) -#ifdef PSM_OPA - || - !STAILQ_EMPTY(&ipsaddr->flows - [EP_FLOW_GO_BACK_N_DMA]. - scb_unacked) -#endif ; if (has_pending) continue; diff --git a/psm3/ptl_ips/ips_proto_connect.h b/psm3/ptl_ips/ips_proto_connect.h index 3298862..51f1f9a 100644 --- a/psm3/ptl_ips/ips_proto_connect.h +++ b/psm3/ptl_ips/ips_proto_connect.h @@ -67,13 +67,9 @@ * version will be added later for scalability. 
* version kept in 2 nibbles in this format: 0xMMmm MM=major, mm=minor version */ -#ifdef PSM_OPA -#define IPS_CONNECT_VERNO 0x0002 // 0.2 -#else // a litle paranod as a UD or UDP connect can't reach a STL100 PSM recv context // but we don't worry about UDP vs UD since can't reach eachother either #define IPS_CONNECT_VERNO 0x0200 // 2.0 - epid_size of 24 bytes (3 word) -#endif #define IPS_CONNECT_VER_MAJOR(verno) (((verno) & 0xff00) >> 8) #define IPS_CONNECT_VER_MINOR(verno) ((verno) & 0x00ff) @@ -159,13 +155,7 @@ struct ips_connect_reqrep { #define IPS_CONNECT_RAIL_ADDR_LEN (3*sizeof(uint64_t)) // length in bytes // For a multi-rail and/or multi-QP run, Up to PSMI_MAX_QPS of rail_addr // follow (24 bytes per rail). -#ifdef PSM_OPA - // epid - 1 word epid formats (8 bytes) - // subnet - (hi 8 bytes of psmi_subnet128) - // 8 bytes of zero (reserved) -#else // 3 word epid format - has full IB/OPA/IPv4/IPv6 subnet -#endif // if we run out of space in a future IPS_CONNECT_VERNO we could // probably compact the IPv6 epid into 20 bytes per rail but leave at // 24 bytes in connect_hdr for good field alignment diff --git a/psm3/ptl_ips/ips_proto_dump.c b/psm3/ptl_ips/ips_proto_dump.c index 8bb277e..b603e9e 100644 --- a/psm3/ptl_ips/ips_proto_dump.c +++ b/psm3/ptl_ips/ips_proto_dump.c @@ -116,11 +116,7 @@ void psm3_ips_proto_show_header(struct ips_message_header *p_hdr, char *msg) printf("BTH: OpCode8-SE1-M1-PC2-TVer4-Pkey16 %x\n", __be32_to_cpu(p_hdr->bth[0])); -#ifdef PSM_OPA - printf("BTH: F1-B1-Res6-DestQP24 %x\n", __be32_to_cpu(p_hdr->bth[1])); -#else printf("BTH: Res24-Flow8 %x\n", __be32_to_cpu(p_hdr->bth[1])); -#endif printf("BTH: A1-PSN31 %x\n", __be32_to_cpu(p_hdr->bth[2])); printf("IPH: jkey-hcrc %x\n", __le32_to_cpu(p_hdr->khdr.kdeth1)); @@ -130,26 +126,8 @@ void psm3_ips_proto_show_header(struct ips_message_header *p_hdr, char *msg) printf("opcode %x\n", _get_proto_hfi_opcode(p_hdr)); ack_seq_num.psn_num = p_hdr->ack_seq_num; -#ifdef PSM_OPA - if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0))) - printf("TidFlow Flow: %x, Gen: %x, Seq: %x\n", - (__be32_to_cpu(p_hdr->bth[1]) >> - HFI_BTH_FLOWID_SHIFT) & HFI_BTH_FLOWID_MASK, - (__be32_to_cpu(p_hdr->bth[2]) >> - HFI_BTH_GEN_SHIFT) & HFI_BTH_GEN_MASK, - (__be32_to_cpu(p_hdr->bth[2]) >> - HFI_BTH_SEQ_SHIFT) & HFI_BTH_SEQ_MASK); - else if (ips_proto_flowid(p_hdr) == EP_FLOW_TIDFLOW) - printf("ack_seq_num gen %x, seq %x\n", - ack_seq_num.psn_gen, ack_seq_num.psn_seq); - else -#endif printf("ack_seq_num %x\n", ack_seq_num.psn_num); printf("src_rank/connidx %x\n", p_hdr->connidx); -#ifdef PSM_OPA - if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0))) - printf("tid_session_gen %d\n", p_hdr->exp_rdescid_genc); -#endif printf("flags %x\n", p_hdr->flags); } diff --git a/psm3/ptl_ips/ips_proto_expected.c b/psm3/ptl_ips/ips_proto_expected.c index b54a606..435b89d 100644 --- a/psm3/ptl_ips/ips_proto_expected.c +++ b/psm3/ptl_ips/ips_proto_expected.c @@ -90,16 +90,8 @@ static void ips_tid_reissue_rdma_write(struct ips_tid_send_desc *tidsendc); #endif static void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context); -#ifdef PSM_OPA -static void ips_tid_avail_callback(struct ips_tid *tidc, void *context); -#endif static void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context); -#ifdef PSM_OPA -/* Defined at the ptl-level (breaks abstractions but needed for shared vs - * non-shared contexts */ -extern int psm3_gen1_ips_ptl_recvq_isempty(const struct ptl *ptl); -#endif #ifdef PSM_HAVE_RDMA static psm2_error_t 
ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc); @@ -128,12 +120,8 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, { struct ips_protoexp *protoexp = NULL; psm2_ep_t ep = proto->ep; -#ifdef PSM_OPA - uint32_t tidmtu_max; -#endif psm2_error_t err = PSM2_OK; -#ifndef PSM_OPA #ifdef PSM_HAVE_REG_MR if (!psmi_hal_has_cap(PSM_HAL_CAP_RDMA)) { #else @@ -143,7 +131,6 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, err = PSM2_INTERNAL_ERR; goto fail; } -#endif protoexp = (struct ips_protoexp *) psmi_calloc(ep, UNDEFINED, 1, sizeof(struct ips_protoexp)); @@ -156,49 +143,12 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, protoexp->ptl = (const struct ptl *)proto->ptl; protoexp->proto = (struct ips_proto *)proto; protoexp->timerq = proto->timerq; -#ifdef PSM_OPA - srand48_r((long int) getpid(), &protoexp->tidflow_drand48_data); -#endif protoexp->tid_flags = protoexp_flags; if (ep->memmode == PSMI_MEMMODE_MINIMAL) { protoexp->tid_flags |= IPS_PROTOEXP_FLAG_CTS_SERIALIZED; } -#ifdef PSM_OPA - // for RDMA Rendezvous we use a single MR for the message so - // we only need 1 entry in the CTS. - // For native mode, the CTS contains a list of TIDs and the window's - // size must be constrained such that the list for all pages in a window - // won't exceed an MTU (eg. CTS message must fit in an MTU) - { - /* - * Adjust the session window size so that tid-grant (CTS) message can - * fit into a single frag size packet for single transfer, PSM - * must send tid-grant message with a single packet. - */ - uint32_t fragsize, winsize; - -#ifndef PSM_OPA - fragsize = proto->epinfo.ep_mtu; -#else - if (proto->flags & IPS_PROTO_FLAG_SDMA) - fragsize = proto->epinfo.ep_mtu; - else - fragsize = proto->epinfo.ep_piosize; -#endif - - winsize = 2 * PSMI_PAGESIZE /* bytes per tid-pair */ - /* space in packet */ - * min((fragsize - sizeof(ips_tid_session_list)), - /* space in tidsendc/tidrecvc descriptor */ - PSM_TIDLIST_BUFSIZE) - / sizeof(uint32_t); /* convert to tid-pair */ - - if (proto->mq->hfi_base_window_rv > winsize) - proto->mq->hfi_base_window_rv = winsize; - } -#endif /* Must be initialized already */ /* Comment out because of Klockwork scanning critical error. CQ 11/16/2012 @@ -212,21 +162,7 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, protoexp->tid_sreq_pool = proto->ep->mq->sreq_pool; protoexp->tid_rreq_pool = proto->ep->mq->rreq_pool; -#ifdef PSM_OPA - /* tid traffic xfer type */ - if (proto->flags & IPS_PROTO_FLAG_SPIO) - protoexp->tid_xfer_type = PSM_TRANSFER_PIO; - else - protoexp->tid_xfer_type = PSM_TRANSFER_DMA; - - /* ctrl ack/nak xfer type */ - if (proto->flags & IPS_PROTO_FLAG_SDMA) - protoexp->ctrl_xfer_type = PSM_TRANSFER_DMA; - else - protoexp->ctrl_xfer_type = PSM_TRANSFER_PIO; -#else protoexp->ctrl_xfer_type = PSM_TRANSFER_PIO; -#endif /* Initialize tid flow control. 
*/ err = psm3_ips_tf_init(protoexp, &protoexp->tfc, @@ -234,63 +170,12 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, if (err != PSM2_OK) goto fail; -#ifdef PSM_OPA - if (proto->flags & IPS_PROTO_FLAG_SPIO) - tidmtu_max = proto->epinfo.ep_piosize; - else - tidmtu_max = proto->epinfo.ep_mtu; - - protoexp->tid_send_fragsize = tidmtu_max; - - if ((err = ips_tid_init(&ep->context, protoexp, - ips_tid_avail_callback, protoexp))) - goto fail; -#endif if ((err = psm3_ips_scbctrl_init(ep, num_of_send_desc, 0, 0, 0, ips_tid_scbavail_callback, protoexp, &protoexp->tid_scbc_rv))) goto fail; -#ifdef PSM_OPA - { - /* Determine interval to generate headers (relevant only when header - * suppression is enabled) else headers will always be generated. - * - * The PSM3_EXPECTED_HEADERS environment variable can specify the - * packet interval to generate headers at. Else a header packet is - * generated every - * min(PSM_DEFAULT_EXPECTED_HEADER, window_size/tid_send_fragsize). - * Note: A header is always generated for the last packet in the flow. - */ - - union psmi_envvar_val env_exp_hdr; - uint32_t defval = min(PSM_DEFAULT_EXPECTED_HEADER, - proto->mq->hfi_base_window_rv / - protoexp->tid_send_fragsize); - - psm3_getenv("PSM3_EXPECTED_HEADERS", - "Interval to generate expected protocol headers", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, - (union psmi_envvar_val)defval, &env_exp_hdr); - - protoexp->hdr_pkt_interval = env_exp_hdr.e_uint; - /* Account for flow credits - Should try to have atleast 4 headers - * generated per window. - */ - protoexp->hdr_pkt_interval = - max(min - (protoexp->hdr_pkt_interval, proto->flow_credits >> 2), - 1); - - if (protoexp->hdr_pkt_interval != env_exp_hdr.e_uint) { - _HFI_VDBG - ("Overriding PSM3_EXPECTED_HEADERS=%u to be '%u'\n", - env_exp_hdr.e_uint, protoexp->hdr_pkt_interval); - } - - } -#endif { union psmi_envvar_val env_rts_cts_interleave; @@ -367,36 +252,6 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, #endif -#ifdef PSM_OPA - protoexp->tid_page_offset_mask = PSMI_PAGESIZE - 1; - protoexp->tid_page_mask = ~(PSMI_PAGESIZE - 1); - - /* - * After ips_tid_init(), we know if we use tidcache or not. - * if tid cache is used, we can't use tid debug. - */ -#ifdef PSM_DEBUG - if (protoexp->tidc.tid_array == NULL) - protoexp->tid_flags |= IPS_PROTOEXP_FLAG_TID_DEBUG; -#endif - - if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { - int i; - protoexp->tid_info = (struct ips_tidinfo *) - psmi_calloc(ep, UNDEFINED, IPS_TID_MAX_TIDS, - sizeof(struct ips_tidinfo)); - if (protoexp->tid_info == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - for (i = 0; i < IPS_TID_MAX_TIDS; i++) { - protoexp->tid_info[i].state = TIDSTATE_FREE; - protoexp->tid_info[i].tidrecvc = NULL; - protoexp->tid_info[i].tid = 0xFFFFFFFF; - } - } else - protoexp->tid_info = NULL; -#endif // PSM_OPA #if defined(PSM_CUDA) || defined(PSM_ONEAPI) { @@ -494,19 +349,11 @@ psm2_error_t psm3_ips_protoexp_fini(struct ips_protoexp *protoexp) if ((err = psm3_ips_scbctrl_fini(&protoexp->tid_scbc_rv))) goto fail; -#ifdef PSM_OPA - if ((err = ips_tid_fini(&protoexp->tidc))) - goto fail; -#endif /* finalize tid flow control. */ if ((err = psm3_ips_tf_fini(&protoexp->tfc))) goto fail; -#ifdef PSM_OPA - if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) - psmi_free(protoexp->tid_info); -#endif psmi_free(protoexp); @@ -547,20 +394,6 @@ void ips_tid_mravail_callback(struct ips_proto *proto) } #endif -#ifdef PSM_OPA -/* New Tids are available. 
If there are pending get requests put the - * get timer on the timerq so it can be processed. */ -static -void ips_tid_avail_callback(struct ips_tid *tidc, void *context) -{ - struct ips_protoexp *protoexp = (struct ips_protoexp *)context; - - if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) - psmi_timer_request(protoexp->timerq, - &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); - return; -} -#endif // On STL100 ips_tf is a user space control for the HW tidflow which // would fully process most valid inbound EXPTID packets within an RV Window. @@ -615,18 +448,11 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, { struct ips_tid_get_request *getreq; int count; -#ifdef PSM_OPA - int tids; -#endif int tidflows; uint64_t nbytes; PSM2_LOG_MSG("entering"); -#ifdef PSM_OPA - psmi_assert((((ips_epaddr_t *) epaddr)->opa.window_rv % PSMI_PAGESIZE) == 0); -#else psmi_assert((req->mq->hfi_base_window_rv % PSMI_PAGESIZE) == 0); -#endif getreq = (struct ips_tid_get_request *) psm3_mpool_get(protoexp->tid_getreq_pool); @@ -656,9 +482,7 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, ((req->is_buf_gpu_mem && (protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) && (length > gpudirect_rdma_recv_limit -#ifndef PSM_OPA || length & 0x03 || (uintptr_t)buf & 0x03 -#endif )))) { getreq->gpu_hostbuf_used = 1; getreq->tidgr_cuda_bytesdone = 0; @@ -688,37 +512,21 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, #endif nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_PAGESIZE); getreq->tidgr_rndv_winsz = -#ifndef PSM_OPA min(nbytes, req->mq->hfi_base_window_rv); -#else - min(nbytes, ((ips_epaddr_t *) epaddr)->opa.window_rv); - /* must be within the tid window size */ - if (getreq->tidgr_rndv_winsz > PSM_TID_WINSIZE) - getreq->tidgr_rndv_winsz = PSM_TID_WINSIZE; -#endif _HFI_MMDBG("posting TID get request: nbytes=%"PRIu64" winsz=%u len=%u\n", nbytes, getreq->tidgr_rndv_winsz, getreq->tidgr_length); // we have now computed the size of each TID sequence (tidgr_rndv_winsz) STAILQ_INSERT_TAIL(&protoexp->pend_getreqsq, getreq, tidgr_next); -#ifdef PSM_OPA - tids = ips_tid_num_available(&protoexp->tidc); -#endif // by using tidflow we also constrain amount of concurrent RDMA to our NIC tidflows = ips_tf_available(&protoexp->tfc); _HFI_MMDBG("available tidflow %u\n", tidflows); if ( -#ifdef PSM_OPA - tids > 0 && -#endif tidflows > 0) // get the actual TIDs and tidflows and send the CTS ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0); else if ( -#ifdef PSM_OPA - tids != -1 && -#endif tidflows != -1) // out of TIDs, set a timer to try again later psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, @@ -746,11 +554,7 @@ void ips_logevent_inner(struct ips_proto *proto, int eventid, void *context) if (t_now >= proto->psmi_logevent_tid_send_reqs.next_warning) { psm3_handle_error(PSMI_EP_LOGEVENT, PSM2_OK, -#ifndef PSM_OPA "Non-fatal temporary exhaustion of send rdma descriptors " -#else - "Non-fatal temporary exhaustion of send tid dma descriptors " -#endif "(elapsed=%.3fs, source %s, count=%lld)", (double) cycles_to_nanosecs(t_now - @@ -809,11 +613,7 @@ psm3_ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc) scb->ips_lrh.data[1].u32w0 = tidrecvc->getreq->tidgr_sendtoken; ips_scb_buffer(scb) = (void *)&tidrecvc->tid_list; -#ifndef PSM_OPA scb->chunk_size = ips_scb_length(scb) = sizeof(tidrecvc->tid_list); -#else - scb->chunk_size = ips_scb_length(scb) = tidrecvc->tsess_tidlist_length; -#endif _HFI_MMDBG("sending 
CTS\n"); PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_TX, proto->ep->epid, @@ -825,42 +625,6 @@ psm3_ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc) flow->flush(flow, NULL); } -#ifdef PSM_OPA -// build and send EXPTID completion ACK. Indicates receiever has gotten -// all TIDs for a given CTS -// for RC QP RDMA, we can use the RC send completion on sender to know -// when all data has been successfully delivered -void -ips_protoexp_send_tid_completion(struct ips_tid_recv_desc *tidrecvc, - ptl_arg_t sdescid) -{ - ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr; - struct ips_proto *proto = tidrecvc->protoexp->proto; - psmi_assert(proto->msgflowid < EP_FLOW_LAST); - struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; - ips_scb_t *scb; - - PSM2_LOG_EPM(OPCODE_EXPTID_COMPLETION,PSM2_LOG_TX, proto->ep->epid, - flow->ipsaddr->epaddr.epid ,"sdescid._desc_idx: %d", - sdescid._desc_idx); - scb = tidrecvc->completescb; - - ips_scb_opcode(scb) = OPCODE_EXPTID_COMPLETION; - scb->ips_lrh.khdr.kdeth0 = 0; - scb->ips_lrh.data[0] = sdescid; - - /* Attached tidflow gen/seq */ - scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val; - - psm3_ips_proto_flow_enqueue(flow, scb); - flow->flush(flow, NULL); - - if (tidrecvc->protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { - flow->flags &= ~IPS_FLOW_FLAG_SKIP_CTS; /* Let the next CTS be processed */ - ips_tid_pendtids_timer_callback(&tidrecvc->protoexp->timer_getreqs, 0); /* and make explicit progress for it. */ - } -} -#endif // PSM_OPA #if defined(PSM_CUDA) || defined(PSM_ONEAPI) static @@ -879,10 +643,6 @@ ips_protoexp_tidsendc_complete(struct ips_tid_send_desc *tidsendc) { #ifdef PSM_VERBS struct ips_protoexp *protoexp = tidsendc->protoexp; -#elif defined(PSM_OPA) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - struct ips_protoexp *protoexp = tidsendc->protoexp; -#endif #endif psm2_mq_req_t req = tidsendc->mqreq; @@ -971,96 +731,6 @@ ips_protoexp_rdma_write_completion(uint64_t wr_id) PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } -#elif defined(PSM_OPA) -// sender processing of EXPTID_COMPLETION message from receiver indicating -// receiver has completely received a given TID sequence -int -ips_protoexp_recv_tid_completion(struct ips_recvhdrq_event *rcv_ev) -{ - struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; - struct ips_message_header *p_hdr = rcv_ev->p_hdr; - struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr; - ptl_arg_t desc_id = p_hdr->data[0]; - struct ips_tid_send_desc *tidsendc; - - _HFI_MMDBG("ips_protoexp_recv_tid_completion\n"); - PSM2_LOG_MSG("entering"); - PSM2_LOG_EPM(OPCODE_EXPTID_COMPLETION,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid, - rcv_ev->proto->ep->mq->ep->epid,"desc_id._desc_idx: %d",desc_id._desc_idx); - - /* normal packet reliabilty protocol handling */ - if (!ips_proto_is_expected_or_nak(rcv_ev)) - { - PSM2_LOG_MSG("leaving"); - return IPS_RECVHDRQ_CONTINUE; - } - - if (__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) - ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, - &ipsaddr->flows[ips_proto_flowid(p_hdr)]); - - psm3_ips_proto_process_ack(rcv_ev); - - /* processing specific to tid_completion packet */ - /* - * Get the session send descriptor and complete. 
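The surviving psm3_ips_protoexp_tid_get_from_token() code in this hunk sizes each rendezvous window by splitting the message across the concurrent flows, rounding the share up to a page, and capping it at the endpoint's base RV window; if no tidflow is free it simply re-arms a timer and retries. A minimal sketch of the window computation, with an assumed 4 KB page and a hypothetical helper name:

    #include <stdint.h>

    #define PAGE 4096ull   /* assumed host page size */

    static uint64_t align_up(uint64_t v, uint64_t a)
    {
        return (v + a - 1) & ~(a - 1);   /* a must be a power of two */
    }

    /* hypothetical helper: bytes granted per window for a `length`-byte
     * message split across `count` flows, capped at the base RV window */
    static uint32_t rndv_window_size(uint64_t length, uint32_t count,
                                     uint32_t base_window_rv)
    {
        uint64_t share = align_up((length + count - 1) / count, PAGE);
        return (uint32_t)(share < base_window_rv ? share : base_window_rv);
    }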
- */ - tidsendc = (struct ips_tid_send_desc *) - psm3_mpool_find_obj_by_index(protoexp->tid_desc_send_pool, - desc_id._desc_idx); - _HFI_VDBG("desc_id=%d (%p)\n", desc_id._desc_idx, tidsendc); - if (tidsendc == NULL) { - _HFI_ERROR - ("exptid comp: Index %d is out of range\n", - desc_id._desc_idx); - PSM2_LOG_MSG("leaving"); - return IPS_RECVHDRQ_CONTINUE; - } else { - ptl_arg_t desc_tidsendc; - - psm3_mpool_get_obj_index_gen_count(tidsendc, - &desc_tidsendc._desc_idx, - &desc_tidsendc._desc_genc); - - _HFI_VDBG("desc_req:id=%d,gen=%d desc_sendc:id=%d,gen=%d\n", - desc_id._desc_idx, desc_id._desc_genc, - desc_tidsendc._desc_idx, desc_tidsendc._desc_genc); - - /* See if the reference is still live and valid */ - if (desc_tidsendc.u64 != desc_id.u64) { - _HFI_ERROR("exptid comp: Genc %d does not match\n", - desc_id._desc_genc); - PSM2_LOG_MSG("leaving"); - return IPS_RECVHDRQ_CONTINUE; - } - } - - if (!STAILQ_EMPTY(&tidsendc->tidflow.scb_unacked)) { - struct ips_message_header hdr; - - /* Hack to handle the tidflow */ - hdr.data[0] = rcv_ev->p_hdr->data[0]; - hdr.ack_seq_num = rcv_ev->p_hdr->mdata; - hdr.khdr.kdeth0 = __cpu_to_le32(3 << HFI_KHDR_TIDCTRL_SHIFT); - rcv_ev->p_hdr = &hdr; - - /* - * This call should directly complete the tidflow - * and free all scb on the unacked queue. - */ - psm3_ips_proto_process_ack(rcv_ev); - - /* Keep KW happy. */ - rcv_ev->p_hdr = NULL; - /* Prove that the scb will not leak in the unacked queue: */ - psmi_assert(STAILQ_EMPTY(&tidsendc->tidflow.scb_unacked)); - } - - ips_protoexp_tidsendc_complete(tidsendc); - - PSM2_LOG_MSG("leaving"); - return IPS_RECVHDRQ_CONTINUE; -} #endif // defined(PSM_VERBS) #endif // PSM_HAVE_RDMA @@ -1501,44 +1171,19 @@ int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, PSM2_LOG_MSG("entering"); desc_id._desc_genc = RDMA_UNPACK_IMMED_GENC(immed); desc_id._desc_idx = RDMA_UNPACK_IMMED_IDX(immed); -#elif defined(PSM_OPA) -int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev) -{ - struct ips_proto *proto = rcv_ev->proto; - struct ips_protoexp *protoexp = proto->protoexp; - struct ips_message_header *p_hdr = rcv_ev->p_hdr; - struct ips_tid_recv_desc *tidrecvc; - ptl_arg_t desc_id; - psmi_seqnum_t sequence_num; - - psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); - _HFI_MMDBG("ips_protoexp_data\n"); - // final packet in a TID sequence, we do some processing here - // for unaligned start and end bytes and send a OPCODE_EXPTID_COMPLETION - - PSM2_LOG_MSG("entering"); - - desc_id._desc_idx = ips_proto_flowid(p_hdr); - PSM2_LOG_EPM(OPCODE_EXPTID,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid, - proto->ep->mq->ep->epid,"desc_id._desc_idx: %d", desc_id._desc_idx); - - desc_id._desc_genc = p_hdr->exp_rdescid_genc; #endif tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; if ((tidrecvc->rdescid._desc_genc & IPS_HDR_RDESCID_GENC_MASK) != desc_id._desc_genc) { -#ifndef PSM_OPA _HFI_ERROR("stale inbound rv RDMA generation: expected %u got %u\n", tidrecvc->rdescid._desc_genc, desc_id._desc_genc); tidrecvc->stats.nGenErr++; -#endif PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; /* skip */ } -#ifndef PSM_OPA // maybe should use assert below so don't add test in production code if (tidrecvc->state != TIDRECVC_STATE_BUSY) { _HFI_ERROR("stale inbound rv RDMA (tidrecvc not busy)\n"); @@ -1594,31 +1239,12 @@ int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev) #endif _HFI_PDBG_DUMP_ALWAYS(tidrecvc->buffer, len); } -#else // PSM_OPA - /* IBTA CCA handling for expected flow. 
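The verbs path retained in ips_protoexp_handle_immed_data() identifies the receive descriptor from the RDMA-write immediate data and rejects completions whose generation count no longer matches, which is how a late or duplicate write aimed at a recycled tidrecvc slot gets dropped. Below is a stand-alone sketch of that index-plus-generation cookie; the bit split and names are assumptions, not the macros used by the patch.

    #include <stdint.h>

    #define IMMED_IDX_BITS 16u   /* assumed split: low 16 bits idx, high 16 bits genc */

    static inline uint32_t pack_immed(uint16_t genc, uint16_t idx)
    {
        return ((uint32_t)genc << IMMED_IDX_BITS) | idx;
    }

    static inline uint16_t unpack_immed_genc(uint32_t immed)
    {
        return (uint16_t)(immed >> IMMED_IDX_BITS);
    }

    static inline uint16_t unpack_immed_idx(uint32_t immed)
    {
        return (uint16_t)(immed & 0xffffu);
    }

    /* receiver side: drop the completion if the slot's generation moved on */
    static inline int immed_is_stale(uint16_t slot_genc, uint32_t immed)
    {
        return slot_genc != unpack_immed_genc(immed);
    }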
*/ - if (rcv_ev->is_congested & IPS_RECV_EVENT_FECN) { - /* Mark flow to generate BECN in control packet */ - tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN; - /* Update stats for congestion encountered */ - proto->epaddr_stats.congestion_pkts++; - /* Clear FECN event */ - rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; - } - - sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); - - if_pf (PSM_HAL_ERROR_OK != psmi_hal_tidflow_check_update_pkt_seq( - protoexp,sequence_num,tidrecvc,p_hdr, - ips_protoexp_do_tf_generr,ips_protoexp_do_tf_seqerr)) - return IPS_RECVHDRQ_CONTINUE; -#endif // PSM_OPA /* Reset the swapped generation count as we received a valid packet */ tidrecvc->tidflow_nswap_gen = 0; /* Do some sanity checking */ psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY); -#ifndef PSM_OPA // STL100 does this at the end of ips_protoexp_send_tid_completion // TBD - seems like this should be done after ips_tid_recv_free // so we have more likelihood of getting freshly freed resources? @@ -1626,150 +1252,17 @@ int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev) tidrecvc->ipsaddr->flows[protoexp->proto->msgflowid].flags &= ~IPS_FLOW_FLAG_SKIP_CTS; /* Let the next CTS be processed */ ips_tid_pendtids_timer_callback(&tidrecvc->protoexp->timer_getreqs, 0); /* and make explicit progress for it. */ } -#else - int recv_completion = (tidrecvc->recv_tidbytes == - (p_hdr->exp_offset + ips_recvhdrq_event_paylen(rcv_ev))); - - /* If sender requested an ACK with the packet and it is not the last - * packet, or if the incoming flow faced congestion, respond with an - * ACK packet. The ACK when congested will have the BECN bit set. - */ - if (((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) && - !recv_completion) || - (tidrecvc->tidflow.flags & IPS_FLOW_FLAG_GEN_BECN)) { - ips_scb_t ctrlscb; - - /* Ack sender with descriptor index */ - ctrlscb.scb_flags = 0; - ctrlscb.ips_lrh.data[0] = p_hdr->exp_sdescid; - ctrlscb.ips_lrh.ack_seq_num = tidrecvc->tidflow_genseq.psn_val; - - // no payload, pass cksum so non-NULL - psm3_ips_proto_send_ctrl_message(&tidrecvc->tidflow, - OPCODE_ACK, - &tidrecvc->ctrl_msg_queued, - &ctrlscb, ctrlscb.cksum, 0); - } - - /* If RSM is a HW capability, and RSM has found a TID packet marked - * with FECN, the payload will be written to the eager buffer, and - * we will have a payload pointer here. In that case, copy the payload - * into the user's buffer. If RSM did not intercept this EXPTID - * packet, the HFI will handle the packet payload. Possibly should - * assert(0 < paylen < MTU). - */ - if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP) && - ips_recvhdrq_event_payload(rcv_ev) && - ips_recvhdrq_event_paylen(rcv_ev)) - psm3_mq_mtucpy(tidrecvc->buffer + p_hdr->exp_offset, - ips_recvhdrq_event_payload(rcv_ev), - ips_recvhdrq_event_paylen(rcv_ev)); - - /* If last packet then we are done. We send a tid transfer completion - * packet back to sender, free all tids and close the current tidflow - * as well as tidrecvc descriptor. - * Note: If we were out of tidflow, this will invoke the callback to - * schedule pending transfer. 
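One behavior kept from the old STL100 flow and still present in the verbs code above is the CTS-serialization gate: with IPS_PROTOEXP_FLAG_CTS_SERIALIZED only one CTS is outstanding, and finishing a window clears the skip flag and immediately re-runs the pending-getreq callback so the next window can be granted. A tiny sketch of that pattern, with illustrative struct and function names:

    #include <stdbool.h>

    struct flow_state   { bool skip_cts; };
    struct pending_work { void (*kick)(struct pending_work *); };

    static void window_done(bool cts_serialized, struct flow_state *flow,
                            struct pending_work *getreqs)
    {
        if (cts_serialized) {
            flow->skip_cts = false;    /* let the next CTS be processed */
            getreqs->kick(getreqs);    /* and make explicit progress for it */
        }
    }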
- */ - if (recv_completion) { - /* copy unaligned data if any */ - uint8_t *dst, *src; - - if (tidrecvc->tid_list.tsess_unaligned_start) { - dst = (uint8_t *)tidrecvc->buffer; - src = (uint8_t *)p_hdr->exp_ustart; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (tidrecvc->is_ptr_gpu_backed) { - PSM3_GPU_MEMCPY_HTOD(dst, src, - tidrecvc->tid_list.tsess_unaligned_start); - } else -#endif - ips_protoexp_unaligned_copy(dst, src, - tidrecvc->tid_list.tsess_unaligned_start); - } - - if (tidrecvc->tid_list.tsess_unaligned_end) { - dst = (uint8_t *)tidrecvc->buffer + - tidrecvc->recv_msglen - - tidrecvc->tid_list.tsess_unaligned_end; - src = (uint8_t *)p_hdr->exp_uend; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (tidrecvc->is_ptr_gpu_backed) { - PSM3_GPU_MEMCPY_HTOD(dst, src, - tidrecvc->tid_list.tsess_unaligned_end); - } else -#endif - ips_protoexp_unaligned_copy(dst, src, - tidrecvc->tid_list.tsess_unaligned_end); - } - - /* reply tid transfer completion packet to sender */ - ips_protoexp_send_tid_completion(tidrecvc, p_hdr->exp_sdescid); -#endif /* Mark receive as done */ ips_tid_recv_free(tidrecvc); _HFI_MMDBG("tidrecv done\n"); -#ifdef PSM_OPA - } -#endif PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } #endif // PSM_HAVE_RDMA -#ifdef PSM_OPA -#ifndef PSM_DEBUG -# define ips_dump_tids(tid_list, msg, ...) -#else -static -void ips_dump_tids(ips_tid_session_list *tid_list, const char *msg, ...) -{ - char buf[256]; - size_t off = 0; - int i, num_tids = tid_list->tsess_tidcount; - va_list argptr; - va_start(argptr, msg); - off += vsnprintf(buf, sizeof(buf) - off, msg, argptr); - va_end(argptr); - - for (i = 0; i < num_tids && off < (sizeof(buf) - 1); i++) - off += snprintf(buf + off, sizeof(buf) - off, "%d%s", - IPS_TIDINFO_GET_TID(tid_list->tsess_list[i]), - i < num_tids - 1 ? "," : ""); - - _HFI_VDBG("%s\n", buf); - return; -} -#endif -#endif // PSM_OPA - -#ifdef PSM_OPA -static -void ips_expsend_tiderr(struct ips_tid_send_desc *tidsendc) -{ - char buf[256]; - size_t off = 0; - int i; - - off += snprintf(buf + off, sizeof(buf) - off, - "Remaining bytes: %d Member id %d is not in tid_session_id=%d :", - tidsendc->remaining_tidbytes, tidsendc->tid_idx, - tidsendc->rdescid._desc_idx); - - for (i = 0; i < tidsendc->tid_list.tsess_tidcount + 1; i++) - off += snprintf(buf + off, sizeof(buf) - off, "%d,", - IPS_TIDINFO_GET_TID(tidsendc->tid_list. 
- tsess_list[i])); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Trying to use tid idx %d and there are %d members: %s\n", - tidsendc->tid_idx, tidsendc->tid_list.tsess_tidcount, - buf); - return; -} -#endif // PSM_OPA #if defined(PSM_CUDA) || defined(PSM_ONEAPI) static @@ -1833,11 +1326,7 @@ void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp, offset = req->prefetch_send_msgoff; window_len = ips_cuda_next_window( -#ifdef PSM_OPA - tidsendc->ipsaddr->opa.window_rv, -#else proto->mq->hfi_base_window_rv, -#endif offset, req->req_data.buf_len); unsigned bufsz = 0; if (window_len <= CUDA_SMALLHOSTBUF_SZ) { @@ -1888,11 +1377,7 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, offset = req->prefetch_send_msgoff; window_len = ips_cuda_next_window( -#ifdef PSM_OPA - tidsendc->ipsaddr->opa.window_rv, -#else proto->mq->hfi_base_window_rv, -#endif offset, req->req_data.buf_len); unsigned bufsz = 0; if (window_len <= CUDA_SMALLHOSTBUF_SZ) { @@ -2023,20 +1508,10 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, uint32_t tid_list_size) { struct ips_tid_send_desc *tidsendc; -#ifdef PSM_OPA - uint32_t i, j, *src, *dst; -#endif _HFI_MMDBG("psm3_ips_tid_send_handle_tidreq\n"); PSM2_LOG_MSG("entering"); -#ifdef PSM_OPA - psmi_assert(tid_list_size > sizeof(ips_tid_session_list)); - psmi_assert(tid_list_size <= sizeof(tidsendc->filler)); - psmi_assert(tid_list->tsess_tidcount > 0); - psmi_assert((rdescid._desc_genc>>16) == 0); -#else psmi_assert(tid_list_size == sizeof(ips_tid_session_list)); -#endif tidsendc = (struct ips_tid_send_desc *) psm3_mpool_get(protoexp->tid_desc_send_pool); @@ -2070,89 +1545,15 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, _HFI_VDBG("recv'd CTS: rkey 0x%x srcoff %u raddr 0x%"PRIx64" len %u\n", tid_list->tsess_rkey, tid_list->tsess_srcoff, tid_list->tsess_raddr, tid_list->tsess_length); -#elif defined(PSM_OPA) - /* - * while doing the copy, we try to merge the tids based on - * following rules: - * 1. both tids are virtually contiguous(i and i+1 in the array); - * 2. both tids have the same tidpair value; - * 3. first tid (i) has tidctrl=1; - * 4. second tid (i+1) has tidctrl=2; - * 5. total length does not exceed 512 pages (2M); - * 6. The h/w supports merged tid_ctrl's. - * - * The restriction of 512 pages comes from the limited number - * of bits we have for KDETH.OFFSET: - * - The entire mapping space provided through TIDs is to be - * viewed as a zero-based address mapping. - * - We have 15 bits in KDETH offset field through which we - * can address upto a maximum of 2MB. - * (with 64-byte offset mode or KDETH.OM = 1) - * - Assuming a 4KB page size, 2MB/4KB = 512 pages. - */ - ips_dump_tids(tid_list, "Received %d tids: ", - tid_list->tsess_tidcount); - - if (psmi_hal_has_cap(PSM_HAL_CAP_MERGED_TID_CTRLS)) - { - src = tid_list->tsess_list; - dst = tidsendc->tid_list.tsess_list; - dst[0] = src[0]; - j = 0; i = 1; - while (i < tid_list->tsess_tidcount) { - if ((((dst[j]>>IPS_TIDINFO_TIDCTRL_SHIFT)+1) == - (src[i]>>IPS_TIDINFO_TIDCTRL_SHIFT)) && - (((dst[j]&IPS_TIDINFO_LENGTH_MASK)+ - (src[i]&IPS_TIDINFO_LENGTH_MASK)) <= - PSM_MAX_NUM_PAGES_IN_TIDPAIR)) { - /* merge 'i' to 'j' - * (We need to specify "tidctrl" value as 3 - * if we merge the individual tid-pairs. 
- * Doing that here) */ - dst[j] += (2 << IPS_TIDINFO_TIDCTRL_SHIFT) + - (src[i] & IPS_TIDINFO_LENGTH_MASK); - i++; - if (i == tid_list->tsess_tidcount) break; - } - j++; - /* copy 'i' to 'j' */ - dst[j] = src[i]; - i++; - } - tidsendc->tid_list.tsess_tidcount = j + 1; - tid_list = &tidsendc->tid_list; - } - else - { - tidsendc->tid_list.tsess_tidcount = tid_list->tsess_tidcount; - psm3_mq_mtucpy(&tidsendc->tid_list.tsess_list, tid_list->tsess_list, - tid_list->tsess_tidcount * sizeof(tid_list->tsess_list[0])); - tid_list = &tidsendc->tid_list; - } - - /* Initialize tidflow for window. Use path requested by remote endpoint */ - psm3_ips_flow_init(&tidsendc->tidflow, protoexp->proto, ipsaddr, - protoexp->tid_xfer_type, PSM_PROTOCOL_TIDFLOW, - IPS_PATH_LOW_PRIORITY, EP_FLOW_TIDFLOW); - tidsendc->tidflow.xmit_seq_num.psn_val = tidflow_genseq; - tidsendc->tidflow.xmit_ack_num.psn_val = tidflow_genseq; - tidsendc->frag_size = min(protoexp->tid_send_fragsize, - tidsendc->tidflow.frag_size); #endif // defined(PSM_VERBS) tidsendc->userbuf = (void *)((uintptr_t) req->req_data.buf + tid_list->tsess_srcoff); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf -#ifdef PSM_OPA - + tid_list->tsess_unaligned_start -#endif ); tidsendc->length = tid_list->tsess_length; _HFI_MMDBG("tidsendc created userbuf %p buffer %p length %u\n", tidsendc->userbuf, tidsendc->buffer, tidsendc->length); -#ifdef PSM_OPA - tidsendc->ctrl_msg_queued = 0; -#endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) /* Matching on previous prefetches and initiating next prefetch */ @@ -2184,9 +1585,6 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tid_list->tsess_srcoff - chb->offset); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf -#ifdef PSM_OPA - + tid_list->tsess_unaligned_start -#endif ); /* now associate the buffer with the tidsendc */ tidsendc->cuda_hostbuf[0] = chb; @@ -2199,9 +1597,6 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, (void *)((uintptr_t) buffer); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf -#ifdef PSM_OPA - + tid_list->tsess_unaligned_start -#endif ); chb_next = STAILQ_NEXT(chb, req_next); tidsendc->cuda_hostbuf[0] = chb; @@ -2213,11 +1608,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, chb, tid_list->tsess_srcoff, tid_list->tsess_length, -#ifdef PSM_OPA - tid_list->tsess_unaligned_start, -#else 0, -#endif rc); } else { psmi_attach_chb_to_tidsendc(protoexp, req, @@ -2225,11 +1616,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, NULL, tid_list->tsess_srcoff, tid_list->tsess_length, -#ifdef PSM_OPA - tid_list->tsess_unaligned_start, -#else 0, -#endif PSMI_CUDA_CONTINUE); } protoexp->proto->strat_stats.rndv_rdma_hbuf_send++; @@ -2244,44 +1631,20 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, protoexp->proto->strat_stats.rndv_rdma_cpu_send_bytes += tid_list->tsess_length; } -#ifdef PSM_OPA - /* frag size must be 64B multiples */ - tidsendc->frag_size &= (~63); -#endif tidsendc->is_complete = 0; -#ifdef PSM_OPA - tidsendc->tid_idx = 0; - tidsendc->frame_send = 0; -#else tidsendc->reserved = 0; #ifdef PSM_HAVE_RNDV_MOD tidsendc->rv_need_err_chk_rdma = 0; tidsendc->rv_sconn_index = 0; tidsendc->rv_conn_count = 0; #endif -#endif -#ifdef PSM_OPA - tidsendc->tidbytes = 0; - tidsendc->remaining_tidbytes = tid_list->tsess_length - - tid_list->tsess_unaligned_start - tid_list->tsess_unaligned_end; - tidsendc->remaining_bytes_in_tid = - (IPS_TIDINFO_GET_LENGTH(tid_list->tsess_list[0]) << 12) - - 
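The TID-pair merge loop removed in this hunk coalesced neighbouring entries of the CTS list when a tidctrl=1 entry was immediately followed by its tidctrl=2 partner and the combined length stayed within the 512-page ceiling imposed by the 15-bit KDETH.OFFSET field in 64-byte offset mode. The sketch below models that pairwise merge over a simplified entry type; the real list packs tid, tidctrl and length into a single uint32_t, so the struct and constant here are illustrative only.

    #include <stddef.h>
    #include <stdint.h>

    #define MAX_PAGES_PER_PAIR 512u   /* 2 MB of 4 KB pages */

    struct tid_entry { uint32_t tid; uint32_t npages; };

    /* copy src[] to dst[], merging an entry into its predecessor when they
     * are adjacent and the combined size stays under the cap; returns the
     * number of entries written */
    static size_t merge_tid_list(struct tid_entry *dst,
                                 const struct tid_entry *src, size_t n)
    {
        size_t j = 0;
        if (n == 0)
            return 0;
        dst[0] = src[0];
        for (size_t i = 1; i < n; i++) {
            if (src[i].tid == dst[j].tid + 1 &&
                dst[j].npages + src[i].npages <= MAX_PAGES_PER_PAIR)
                dst[j].npages += src[i].npages;   /* merge i into j */
            else
                dst[++j] = src[i];                /* start a new entry */
        }
        return j + 1;
    }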
tid_list->tsess_tidoffset; - tidsendc->offset_in_tid = tid_list->tsess_tidoffset; -#endif _HFI_EXP ("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d" -#ifdef PSM_OPA - ",s=%d,e=%d" -#endif "\n", tidsendc->sdescid._desc_idx, rdescid._desc_idx, tid_list->tsess_srcoff, tid_list->tsess_length -#ifdef PSM_OPA - , tid_list->tsess_unaligned_start, tid_list->tsess_unaligned_end -#endif ); // start sending TIDEXP packets @@ -2299,231 +1662,6 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, return PSM2_OK; } -#ifdef PSM_OPA -// compose a sequence of EXPTID packets to be sent -// builds one scb with proper headers and tids. When using PIO -// the scb is for a single packet. When using SDMA, header generation -// will let the scb describe a frag_size larger than a packet -static -ips_scb_t * -ips_scb_prepare_tid_sendctrl(struct ips_flow *flow, - struct ips_tid_send_desc *tidsendc) -{ - struct ips_protoexp *protoexp = tidsendc->protoexp; - uint32_t *tsess_list = tidsendc->tid_list.tsess_list; - uint32_t tid, omode, offset, chunk_size; - uint32_t startidx, endidx; - uint32_t frame_len, nfrag; - uint8_t *bufptr = tidsendc->buffer; - ips_scb_t *scb; - - uint8_t is_payload_per_frag_leq_8dw = 0; - /* If payload in the first and last nfrag is less then or equal - * to 8DW we disable header suppression so as to detect uncorrectable - * errors which will otherwise be non-detectable(since header is - * suppressed we lose RHF.EccErr) - */ - if ((scb = psm3_ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0)) == NULL) - return NULL; - - /* - * Make sure the next offset is in 64B multiples with the tid. - */ - frame_len = - min(tidsendc->remaining_bytes_in_tid, tidsendc->remaining_tidbytes); - if (frame_len > tidsendc->frag_size) { - frame_len = - tidsendc->frag_size - (tidsendc->offset_in_tid & 63); - } - /* - * Frame length is the amount of payload to be included in a particular - * frag of the scb, so we check if frame len is less than or equal - * to 8DW. If length is less then then or equal to 8DW for the first - * frag then we avoid header suppression - */ - if (frame_len <= 32) - is_payload_per_frag_leq_8dw = 1; - - /* - * Using large offset mode based on offset length. - */ - if (tidsendc->offset_in_tid < 131072) { /* 2^15 * 4 */ - psmi_assert((tidsendc->offset_in_tid % 4) == 0); - offset = tidsendc->offset_in_tid / 4; - omode = 0; - } else { - psmi_assert((tidsendc->offset_in_tid % 64) == 0); - offset = tidsendc->offset_in_tid / 64; - omode = 1; - } - startidx = tidsendc->tid_idx; - tid = IPS_TIDINFO_GET_TID(tsess_list[startidx]); - scb->ips_lrh.khdr.kdeth0 = __cpu_to_le32((offset & HFI_KHDR_OFFSET_MASK) - | (omode << HFI_KHDR_OM_SHIFT) | (tid << HFI_KHDR_TID_SHIFT)); - - scb->tidctrl = IPS_TIDINFO_GET_TIDCTRL(tsess_list[startidx]); - scb->tsess = (uint32_t *) &tsess_list[startidx]; - - /* - * Payload and buffer address for current packet. payload_size - * must be the first packet size because it is used to initialize - * the packet header. - */ - scb->payload_size = frame_len; - ips_scb_buffer(scb) = (void *)bufptr; - scb->frag_size = tidsendc->frag_size; - - /* - * Other packet fields. 
- */ - PSM2_LOG_EPM(OPCODE_EXPTID,PSM2_LOG_TX, protoexp->proto->ep->epid, - flow->ipsaddr->epaddr.epid, - "psm3_mpool_get_obj_index(tidsendc->mqreq): %d, tidsendc->rdescid._desc_idx: %d, tidsendc->sdescid._desc_idx: %d", - psm3_mpool_get_obj_index(tidsendc->mqreq),tidsendc->rdescid._desc_idx,tidsendc->sdescid._desc_idx); - ips_scb_opcode(scb) = OPCODE_EXPTID; - scb->ips_lrh.exp_sdescid = tidsendc->sdescid; - scb->ips_lrh.exp_rdescid_genc = (uint16_t)tidsendc->rdescid._desc_genc; - scb->ips_lrh.exp_offset = tidsendc->tidbytes; - - scb->tidsendc = tidsendc; - SLIST_NEXT(scb, next) = NULL; - - /* - * Loop over the tid session list, count the frag number and payload size. - */ - nfrag = 1; - chunk_size = frame_len; - while (1) { - /* Record last tididx used */ - endidx = tidsendc->tid_idx; - /* Check if all tidbytes are done */ - tidsendc->remaining_tidbytes -= frame_len; - if (!tidsendc->remaining_tidbytes) { - /* We do another frame length check for the last frag */ - if (frame_len <= 32) - is_payload_per_frag_leq_8dw = 1; - break; - } - - /* Update in current tid */ - tidsendc->remaining_bytes_in_tid -= frame_len; - tidsendc->offset_in_tid += frame_len; - psmi_assert((tidsendc->offset_in_tid >= 128*1024) ? - ((tidsendc->offset_in_tid % 64) == 0) : - ((tidsendc->offset_in_tid % 4) == 0)); - - /* Done with this tid, move on to the next tid */ - if (!tidsendc->remaining_bytes_in_tid) { - tidsendc->tid_idx++; - psmi_assert_always(tidsendc->tid_idx < - tidsendc->tid_list.tsess_tidcount); - tidsendc->remaining_bytes_in_tid = - IPS_TIDINFO_GET_LENGTH(tsess_list - [tidsendc->tid_idx]) << 12; - tidsendc->offset_in_tid = 0; - } - - /* For PIO, only single packet per scb allowed */ - if (flow->transfer == PSM_TRANSFER_PIO) { - break; - } - - frame_len = - min(tidsendc->remaining_bytes_in_tid, - tidsendc->remaining_tidbytes); - if (frame_len > tidsendc->frag_size) - frame_len = tidsendc->frag_size; - nfrag++; - chunk_size += frame_len; - } - - scb->nfrag = nfrag; - scb->chunk_size = chunk_size; - if (nfrag > 1) { - scb->nfrag_remaining = scb->nfrag; - scb->chunk_size_remaining = scb->chunk_size; - } - scb->tsess_length = (endidx - startidx + 1) * sizeof(uint32_t); - - /* Keep track of latest buffer location so we restart at the - * right location, if we don't complete the transfer */ - tidsendc->buffer = bufptr + chunk_size; - tidsendc->tidbytes += chunk_size; - - if (flow->transfer == PSM_TRANSFER_DMA && - psmi_hal_has_cap(PSM_HAL_CAP_DMA_HSUPP_FOR_32B_MSGS)) { - is_payload_per_frag_leq_8dw = 0; - } - - /* If last packet, we want a completion notification */ - if (!tidsendc->remaining_tidbytes) { - /* last packet/chunk, attach unaligned data */ - uint8_t *dst, *src; - - if (tidsendc->tid_list.tsess_unaligned_start) { - dst = (uint8_t *)scb->ips_lrh.exp_ustart; - src = (uint8_t *)tidsendc->userbuf; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (IS_TRANSFER_BUF_GPU_MEM(scb) && !tidsendc->mqreq->gpu_hostbuf_used) { - PSM3_GPU_MEMCPY_DTOH(dst, src, - tidsendc->tid_list.tsess_unaligned_start); - } else -#endif - ips_protoexp_unaligned_copy(dst, src, - tidsendc->tid_list.tsess_unaligned_start); - } - - if (tidsendc->tid_list.tsess_unaligned_end) { - dst = (uint8_t *)&scb->ips_lrh.exp_uend; - src = (uint8_t *)tidsendc->userbuf + - tidsendc->length - - tidsendc->tid_list.tsess_unaligned_end; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (IS_TRANSFER_BUF_GPU_MEM(scb) && !tidsendc->mqreq->gpu_hostbuf_used) { - PSM3_GPU_MEMCPY_DTOH(dst, src, - tidsendc->tid_list.tsess_unaligned_end); - } else -#endif - 
ips_protoexp_unaligned_copy(dst, src, - tidsendc->tid_list.tsess_unaligned_end); - } - /* - * If the number of fragments is greater then one and - * "no header suppression" flag is unset then we go - * ahead and suppress the header */ - if ((scb->nfrag > 1) && (!is_payload_per_frag_leq_8dw)) - scb->scb_flags |= IPS_SEND_FLAG_HDRSUPP; - else - scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; - - tidsendc->is_complete = 1; // all scb's queued for send - } else { - /* Do not suppress header every hdr_pkt_interval */ - if ((++tidsendc->frame_send % - protoexp->hdr_pkt_interval) == 0) - /* Request an ACK */ - scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; - else { - if (!is_payload_per_frag_leq_8dw) { - /* Request hdr supp */ - scb->scb_flags |= IPS_SEND_FLAG_HDRSUPP; - } - } - /* assert only single packet per scb */ - psmi_assert(scb->nfrag == 1); - } - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (tidsendc->mqreq->is_buf_gpu_mem && /* request's buffer comes from GPU realm */ - !tidsendc->mqreq->gpu_hostbuf_used) { /* and it was NOT moved to HOST memory */ - scb->mq_req = tidsendc->mqreq; /* so let's mark it per scb, not to check its locality again */ - ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; - } -#endif - - return scb; -} -#endif // PSM_OPA #if defined(PSM_VERBS) /* @@ -2694,16 +1832,8 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) { -#ifdef PSM_OPA - ips_scb_t *scb = NULL; -#endif psm2_error_t err = PSM2_OK; -#ifdef PSM_OPA - psm2_error_t err_f; - struct ips_protoexp *protoexp = tidsendc->protoexp; - struct ips_proto *proto = protoexp->proto; - struct ips_flow *flow = &tidsendc->tidflow; -#elif defined(PSM_CUDA) || defined(PSM_ONEAPI) +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) struct ips_protoexp *protoexp = tidsendc->protoexp; #endif @@ -2774,48 +1904,7 @@ psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) tidsendc->cuda_hostbuf[1] = NULL; } #endif -#ifdef PSM_OPA - /* - * We aggressively try to grab as many scbs as possible, enqueue them to a - * flow and flush them when either we're out of scbs or we've completely - * filled the send request. 
- */ - while (!tidsendc->is_complete) { - if_pf(tidsendc->tid_list.tsess_tidcount && - (tidsendc->tid_idx >= tidsendc->tid_list.tsess_tidcount || - tidsendc->tid_idx < 0)) - ips_expsend_tiderr(tidsendc); - - if ((scb = - ips_scb_prepare_tid_sendctrl(flow, tidsendc)) == NULL) { - proto->stats.scb_exp_unavail_cnt++; - err = PSM2_EP_NO_RESOURCES; - break; - } else { - // queue up the sends, likely to be SDMA - psm3_ips_proto_flow_enqueue(flow, scb); - } - } - - if (!SLIST_EMPTY(&flow->scb_pend)) { /* Something to flush */ - int num_sent; - - // this will kick off the sends, likely to be SDMA - err_f = flow->flush(flow, &num_sent); - - // since we are using the tidflow, we ensure a future - // timer callback will flush the remaining scbs or - // process the rcvhdrq - if (err != PSM2_EP_NO_RESOURCES) { - /* PSM2_EP_NO_RESOURCES is reserved for out-of-scbs */ - if (err_f == PSM2_EP_NO_RESOURCES) - err = PSM2_TIMEOUT; /* force a resend reschedule */ - else if (err_f == PSM2_OK && num_sent > 0 && - !psm3_gen1_ips_ptl_recvq_isempty(protoexp->ptl)) - err = PSM2_OK_NO_PROGRESS; /* force a rcvhdrq service */ - } - } -#elif defined(PSM_VERBS) +#if defined(PSM_VERBS) err = ips_tid_issue_rdma_write(tidsendc); #endif @@ -2918,123 +2007,6 @@ ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current) and allows for a single call to the core VM code in the kernel, rather than one per page, definitely improving performance. */ -#ifdef PSM_OPA -static -psm2_error_t -ips_tid_recv_alloc_frag(struct ips_protoexp *protoexp, - struct ips_tid_recv_desc *tidrecvc, - uint32_t nbytes_this) -{ - ips_tid_session_list *tid_list = &tidrecvc->tid_list; - uintptr_t bufptr = (uintptr_t) tidrecvc->buffer; - uint32_t size = nbytes_this; - psm2_error_t err = PSM2_OK; - uintptr_t pageaddr; - uint32_t tidoff, pageoff, pagelen, reglen, num_tids; - - psmi_assert(size >= 4); - - /* - * The following calculation does not work when size < 4 - * and bufptr is byte aligned, it can get negative value. - */ - tid_list->tsess_unaligned_start = (bufptr & 3) ? (4 - (bufptr & 3)) : 0; - size -= tid_list->tsess_unaligned_start; - bufptr += tid_list->tsess_unaligned_start; - - tid_list->tsess_unaligned_end = size & 3; - size -= tid_list->tsess_unaligned_end; - - psmi_assert(size > 0); - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - /* Driver pins GPU pages when using GPU Direct RDMA for TID recieves, - * to accomadate this change the calculations of pageaddr, pagelen - * and pageoff have been modified to take GPU page size into - * consideration. 
- */ - if (tidrecvc->is_ptr_gpu_backed) { - uint64_t page_mask = ~(PSMI_GPU_PAGESIZE -1); - uint32_t page_offset_mask = (PSMI_GPU_PAGESIZE -1); - pageaddr = bufptr & page_mask; - pagelen = (uint32_t) (PSMI_GPU_PAGESIZE + - ((bufptr + size - 1) & page_mask) - - (bufptr & page_mask)); - tidoff = pageoff = (uint32_t) (bufptr & page_offset_mask); - } else -#endif - { - pageaddr = bufptr & protoexp->tid_page_mask; - pagelen = (uint32_t) (PSMI_PAGESIZE + - ((bufptr + size - 1) & protoexp->tid_page_mask) - - (bufptr & protoexp->tid_page_mask)); - tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask); - } - - reglen = pagelen; - if (protoexp->tidc.tid_array) { - if ((err = ips_tidcache_acquire(&protoexp->tidc, - (void *)pageaddr, ®len, - (uint32_t *) tid_list->tsess_list, &num_tids, - &tidoff -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - , tidrecvc->is_ptr_gpu_backed -#endif - ))) - goto fail; - } else { - if ((err = ips_tid_acquire(&protoexp->tidc, - (void *)pageaddr, ®len, - (uint32_t *) tid_list->tsess_list, &num_tids -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - , tidrecvc->is_ptr_gpu_backed -#endif - ))) - goto fail; - } - - /* - * PSM2 currently provides storage space enough to hold upto - * 1024 tids. (PSM_TIDLIST_BUFSIZE). So, make sure we - * don't get more than what we can hold from the tidcache here. - * - * The reason for 1024 tids comes from the PSM_TID_WINSIZE value - * (currently 4MB. So, if in future, there is a change to this macro, - * then you would need a change to PSM_TIDLIST_BUFSIZE as well). - * - * Assuming a 4KB page size, to be able to receive - * a message of 4MB size, we'd need an maximum of 4MB/4KB = 1024 tids. - */ - psmi_assert(num_tids > 0); - psmi_assert(num_tids <= (PSM_TID_WINSIZE/PSM_TIDLIST_BUFSIZE)); - if (reglen > pagelen) { - err = psm3_handle_error(protoexp->tidc.context->ep, - PSM2_EP_DEVICE_FAILURE, - "PSM tid registration: " - "register more pages than asked"); - goto fail; - } else if (reglen < pagelen) { - /* - * driver registered less pages, update PSM records. - */ - tid_list->tsess_unaligned_end = 0; - tidrecvc->recv_tidbytes = reglen - pageoff; - tidrecvc->recv_msglen = tid_list->tsess_unaligned_start + - tidrecvc->recv_tidbytes; - } else { - tidrecvc->recv_tidbytes = size; - tidrecvc->recv_msglen = nbytes_this; - } - - tid_list->tsess_tidcount = num_tids; - tid_list->tsess_tidoffset = tidoff; - - ips_dump_tids(tid_list, "Registered %d tids: ", num_tids); - -fail: - return err; -} -#endif // PSM_OPA static psm2_error_t @@ -3045,9 +2017,6 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, { psm2_error_t err; ips_scb_t *grantscb; -#ifdef PSM_OPA - ips_scb_t *completescb; -#endif #ifdef PSM_VERBS psm2_mq_req_t req = getreq->tidgr_req; #elif defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -3075,24 +2044,11 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, return PSM2_EP_NO_RESOURCES; } -#ifdef PSM_OPA - /* 2. allocate a tid complete (final ACK) scb. */ - completescb = psm3_ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0); - if (completescb == NULL) { - psm3_ips_scbctrl_free(grantscb); - /* ips_tid_scbavail_callback() will reschedule */ - PSM2_LOG_MSG("leaving"); - return PSM2_EP_NO_RESOURCES; - } -#endif /* 3. allocate a tid flow entry. */ err = psm3_ips_tf_allocate(&protoexp->tfc, &tidrecvc); if (err != PSM2_OK) { _HFI_MMDBG("Wait: NO tid flow\n"); -#ifdef PSM_OPA - psm3_ips_scbctrl_free(completescb); -#endif psm3_ips_scbctrl_free(grantscb); /* Unable to get a tidflow for expected protocol. 
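The deleted ips_tid_recv_alloc_frag() trimmed the destination buffer to 4-byte alignment and then rounded the remaining range out to whole pages before asking the driver to pin it; the GPU branch used the same formula with the GPU page size. A stand-alone sketch of the page-span arithmetic, assuming a 4 KB page:

    #include <stdint.h>

    #define PAGE 4096ull   /* assumed host page size */

    struct page_span {
        uint64_t pageaddr;   /* first page covering the buffer            */
        uint32_t pagelen;    /* whole pages spanned, in bytes             */
        uint32_t pageoff;    /* offset of the data within the first page  */
    };

    static struct page_span span_of(uint64_t bufptr, uint32_t size)
    {
        struct page_span s;
        uint64_t mask = ~(PAGE - 1);

        s.pageaddr = bufptr & mask;
        s.pagelen  = (uint32_t)(PAGE + ((bufptr + size - 1) & mask)
                                     - (bufptr & mask));
        s.pageoff  = (uint32_t)(bufptr & (PAGE - 1));
        return s;
    }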
*/ psmi_timer_request(protoexp->timerq, @@ -3131,9 +2087,6 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, * Release the resources we're holding and reschedule.*/ psm3_ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx, 0); -#ifdef PSM_OPA - psm3_ips_scbctrl_free(completescb); -#endif psm3_ips_scbctrl_free(grantscb); psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, @@ -3198,70 +2151,17 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, } tidrecvc->recv_msglen = nbytes_this; -#elif defined(PSM_OPA) - /* 5. allocate some tids from driver. */ - err = ips_tid_recv_alloc_frag(protoexp, tidrecvc, nbytes_this); - if (err != PSM2_OK) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (chb) - psm3_mpool_put(chb); -#endif - psm3_ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx, 0); - psm3_ips_scbctrl_free(completescb); - psm3_ips_scbctrl_free(grantscb); - /* Unable to register tids */ - psmi_timer_request(protoexp->timerq, - &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); - PSM2_LOG_MSG("leaving"); - return err; - } - - if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { - int num_tids = tidrecvc->tid_list.tsess_tidcount; - int tid, i; - for (i = 0; i < num_tids; i++) { - tid = - IPS_TIDINFO_GET_TID(tidrecvc->tid_list. - tsess_list[i]) * 2 + - IPS_TIDINFO_GET_TIDCTRL(tidrecvc->tid_list. - tsess_list[i]) - 1; - psmi_assert(protoexp->tid_info[tid].state == - TIDSTATE_FREE); - psmi_assert(protoexp->tid_info[tid].tidrecvc == NULL); - psmi_assert(protoexp->tid_info[tid].tid == 0xFFFFFFFF); - protoexp->tid_info[tid].state = TIDSTATE_USED; - protoexp->tid_info[tid].tidrecvc = tidrecvc; - protoexp->tid_info[tid].tid = - tidrecvc->tid_list.tsess_list[i]; - } - } #endif /* Initialize recv descriptor */ tidrecvc->ipsaddr = ipsaddr; tidrecvc->getreq = (struct ips_tid_get_request *)getreq; -#ifdef PSM_OPA - /* Initialize tidflow, instead calling generic routine: - psm3_ips_flow_init(&tidrecvc->tidflow, protoexp->proto, ipsaddr, - protoexp->ctrl_xfer_type, PSM_PROTOCOL_TIDFLOW, - IPS_PATH_LOW_PRIORITY, EP_FLOW_TIDFLOW); - * only reset following necessary field. 
*/ - tidrecvc->tidflow.ipsaddr = ipsaddr; - tidrecvc->tidflow.flags = 0; -#endif tidrecvc->tidflow_nswap_gen = 0; tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen; tidrecvc->tidflow_genseq.psn_seq = 0; /* Always start sequence number at 0 (zero), in order to prevent wraparound sequence numbers */ -#ifdef PSM_OPA - psmi_hal_tidflow_set_entry( - tidrecvc->rdescid._desc_idx, - tidrecvc->tidflow_genseq.psn_gen, - tidrecvc->tidflow_genseq.psn_seq, - tidrecvc->context->psm_hw_ctxt); -#endif tidrecvc->tid_list.tsess_srcoff = getreq->tidgr_offset; tidrecvc->tid_list.tsess_length = tidrecvc->recv_msglen; @@ -3273,9 +2173,6 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, tidrecvc->tid_list.tsess_raddr = tidrecvc->mr->iova + ((uintptr_t)tidrecvc->buffer - (uintptr_t)tidrecvc->mr->addr); #endif -#ifdef PSM_OPA - tidrecvc->ctrl_msg_queued = 0; -#endif tidrecvc->state = TIDRECVC_STATE_BUSY; tidrecvc->stats.nSeqErr = 0; @@ -3283,27 +2180,10 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, tidrecvc->stats.nReXmit = 0; tidrecvc->stats.nErrChkReceived = 0; -#ifdef PSM_OPA - /* This gets sent out as a control message, so we need to force 4-byte IB - * alignment */ - tidrecvc->tsess_tidlist_length = (uint16_t) - PSMI_ALIGNUP((sizeof(ips_tid_session_list) + - (tidrecvc->tid_list.tsess_tidcount * - sizeof(uint32_t))), 4); - - _HFI_EXP("alloc tidrecv=%d, paylen=%d, ntid=%d\n", - tidrecvc->rdescid._desc_idx, - tidrecvc->tsess_tidlist_length, - tidrecvc->tid_list.tsess_tidcount); -#else _HFI_EXP("alloc tidrecv=%d\n", tidrecvc->rdescid._desc_idx); -#endif tidrecvc->grantscb = grantscb; -#ifdef PSM_OPA - tidrecvc->completescb = completescb; -#endif *ptidrecvc = tidrecvc; /* return to caller */ PSM2_LOG_MSG("leaving"); @@ -3354,15 +2234,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if ( -#ifdef PSM_OPA - !(((struct ips_protoexp *)timer->context)->proto->flags - & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) || - ((((struct ips_protoexp *)timer->context)->proto->flags & - IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) && - gpudirect_rdma_recv_limit < UINT_MAX) -#else 1 /* due to unaligned recv using hostbuf, must always do this */ -#endif ) { /* Before processing pending TID requests, first try to free up * any CUDA host buffers that are now idle. */ @@ -3491,25 +2363,9 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) _HFI_MMDBG("ips_tid_pendtids_timer_callback: page align nbytes_this %u\n", nbytes_this); psmi_assert(nbytes_this >= 4); -#ifdef PSM_OPA - psmi_assert(nbytes_this <= PSM_TID_WINSIZE); -#endif // for STL native the tids and tidflows available pace incoming TIDs // for UD we still use tidflows available to pace incoming RDMA -#ifdef PSM_OPA - if ((ret = ips_tid_num_available(&protoexp->tidc)) <= 0) { - /* We're out of tids. If this process used all the resource, - * the free callback will reschedule the operation, otherwise, - * we reschedule it here */ - if (ret == 0) - { - psmi_timer_request(protoexp->timerq, - &protoexp->timer_getreqs, - PSMI_TIMER_PRIO_1); - } - } else -#endif if ((ret = ips_tf_available(&protoexp->tfc)) <= 0) { /* We're out of tidflow. 
If this process used all the resource, * the free callback will reschedule the operation, otherwise, @@ -3611,13 +2467,7 @@ void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc) struct ips_protoexp *protoexp = tidrecvc->protoexp; struct ips_gpu_hostbuf *chb; const uint32_t transfer_size = -#ifndef PSM_OPA tidrecvc->recv_msglen; -#else - tidrecvc->recv_tidbytes - + tidrecvc->tid_list.tsess_unaligned_start - + tidrecvc->tid_list.tsess_unaligned_end; -#endif chb = tidrecvc->cuda_hostbuf; chb->size += transfer_size; @@ -3638,15 +2488,9 @@ psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) { struct ips_protoexp *protoexp = tidrecvc->protoexp; struct ips_tid_get_request *getreq = tidrecvc->getreq; -#ifdef PSM_OPA - int tidcount = tidrecvc->tid_list.tsess_tidcount; -#endif psm2_error_t err = PSM2_OK; psmi_assert(getreq != NULL); -#ifdef PSM_OPA - psmi_assert(tidcount > 0); -#endif psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY); #if defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -3654,47 +2498,11 @@ psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) psmi_cudamemcpy_tid_to_device(tidrecvc); #endif -#ifndef PSM_OPA if (tidrecvc->mr) { _HFI_MMDBG("CTS recv chunk complete, releasing MR: rkey: 0x%x\n", tidrecvc->mr->rkey); psm3_verbs_release_mr(tidrecvc->mr); tidrecvc->mr = NULL; } -#elif defined(PSM_OPA) - if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { - int tid, i; - - for (i = 0; i < tidcount; i++) { - tid = - IPS_TIDINFO_GET_TID(tidrecvc->tid_list. - tsess_list[i]) * 2 + - IPS_TIDINFO_GET_TIDCTRL(tidrecvc->tid_list. - tsess_list[i]) - 1; - psmi_assert(protoexp->tid_info[tid].state == - TIDSTATE_USED); - psmi_assert(protoexp->tid_info[tid].tidrecvc == - tidrecvc); - psmi_assert(protoexp->tid_info[tid].tid == - tidrecvc->tid_list.tsess_list[i]); - protoexp->tid_info[tid].state = TIDSTATE_FREE; - protoexp->tid_info[tid].tidrecvc = NULL; - protoexp->tid_info[tid].tid = 0xFFFFFFFF; - } - } - - ips_dump_tids(&tidrecvc->tid_list, "Deregistered %d tids: ", - tidrecvc->tid_list.tsess_tidcount); - - if (protoexp->tidc.tid_array) { - if ((err = ips_tidcache_release(&protoexp->tidc, - tidrecvc->tid_list.tsess_list, tidcount))) - goto fail; - } else { - if ((err = ips_tid_release(&protoexp->tidc, - tidrecvc->tid_list.tsess_list, tidcount))) - goto fail; - } -#endif getreq->tidgr_bytesdone += tidrecvc->recv_msglen; @@ -3735,159 +2543,11 @@ psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) /* we freed some an MR If we have pending sends or pending get requests, * turn on the timer so it can be processed. 
*/ ips_tid_mravail_callback(protoexp->proto); -#elif defined(PSM_OPA) - if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) { - psmi_timer_request(protoexp->timerq, - &protoexp->timer_getreqs, - PSMI_TIMER_PRIO_1); - } #endif -#ifdef PSM_OPA -fail: -#endif return err; } #endif // PSM_HAVE_RDMA -#ifdef PSM_OPA -// This advancaes the generation for our tidflow -psm2_error_t -ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc) -{ - psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY); - ips_tfgen_allocate(&tidrecvc->protoexp->tfc, - tidrecvc->rdescid._desc_idx, - &tidrecvc->tidflow_active_gen); - /* Update tidflow table with new generation number */ - tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen; - psmi_hal_tidflow_set_entry( - tidrecvc->rdescid._desc_idx, - tidrecvc->tidflow_genseq.psn_gen, - tidrecvc->tidflow_genseq.psn_seq, - tidrecvc->context->psm_hw_ctxt); - /* Increment swapped generation count for tidflow */ - tidrecvc->tidflow_nswap_gen++; - return PSM2_OK; -} -#endif // PSM_OPA - -#ifdef PSM_OPA -void ips_protoexp_do_tf_seqerr(void *vpprotoexp - /* actually: struct ips_protoexp *protoexp */, - void *vptidrecvc - /* actually: struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr) -{ - struct ips_protoexp *protoexp = (struct ips_protoexp *) vpprotoexp; - struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) vptidrecvc; - psmi_seqnum_t sequence_num, tf_sequence_num; - ips_scb_t ctrlscb; - /* Update stats for sequence errors */ - tidrecvc->stats.nSeqErr++; - - sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); - - /* Only care about sequence error for currently active generation */ - if (tidrecvc->tidflow_active_gen != sequence_num.psn_gen) - return; - - /* If a "large" number of swapped generation we are loosing packets - * for this flow. Request throttling of tidflow by generating a - * BECN. With header suppression we will miss some FECN packet - * on OPA hence keeping track of swapped generation is another - * mechanism to do congestion control for tidflows. - * - * For mismatched sender/receiver/link speeds we can get into a - * deadly embrace where minimal progress is made due to generation - * mismatch errors. This can occur if we wrap around the generation - * count without making progress. Hence in cases where the swapped - * generation count is > 254 stop sending BECN (and the NAK) so the - * send -> receiver pipeline is flushed with an error check and things - * can sync up. This should be an extremely rare event. - */ - - if_pf(tidrecvc->tidflow_nswap_gen >= 254) - return; /* Do not send NAK. Let error check kick in. */ - - if_pf((tidrecvc->tidflow_nswap_gen > 4) && - (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) { - _HFI_CCADBG("Generating BECN. Number of swapped gen: %d.\n", - tidrecvc->tidflow_nswap_gen); - /* Mark flow to generate BECN in control packet */ - tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN; - - /* Update stats for congestion encountered */ - protoexp->proto->epaddr_stats.congestion_pkts++; - } - - /* Get the latest seq from hardware tidflow table, if that value is - * reliable. The value is not reliable if context sharing is used, - * because context sharing might drop packet even though hardware - * has received it successfully. The hardware table may also be - * incorrect if RSM is intercepting TID & FECN & SH packets. - * We can handle this condition by taking the most recent PSN whether - * it comes from the tidflow table or from PSM's own accounting. 
- */ - if (!tidrecvc->context->tf_ctrl) { - uint64_t tf; - uint32_t seqno=0; - - psmi_hal_tidflow_get(tidrecvc->rdescid._desc_idx, &tf, - tidrecvc->context->psm_hw_ctxt); - psmi_hal_tidflow_get_seqnum(tf, &seqno); - tf_sequence_num.psn_val = seqno; - - if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP)) { - if (tf_sequence_num.psn_val > tidrecvc->tidflow_genseq.psn_seq) - tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq; - } - else - tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq; - } - - /* Swap generation for the flow. */ - ips_protoexp_flow_newgen(tidrecvc); - - ctrlscb.scb_flags = 0; - ctrlscb.ips_lrh.data[0] = p_hdr->exp_sdescid; - /* Keep peer generation but use my last received sequence */ - sequence_num.psn_seq = tidrecvc->tidflow_genseq.psn_seq; - ctrlscb.ips_lrh.ack_seq_num = sequence_num.psn_val; - - /* My new generation and last received sequence */ - ctrlscb.ips_lrh.data[1].u32w0 = tidrecvc->tidflow_genseq.psn_val; - - // no payload, pass cksum so non-NULL - psm3_ips_proto_send_ctrl_message(&tidrecvc->tidflow, - OPCODE_NAK, - &tidrecvc->ctrl_msg_queued, - &ctrlscb, ctrlscb.cksum, 0); - - /* Update stats for retransmit */ - tidrecvc->stats.nReXmit++; - - return; -} -#endif // PSM_OPA - -#ifdef PSM_OPA -void ips_protoexp_do_tf_generr(void *vpprotoexp - /* actually: struct ips_protoexp *protoexp */, - void *vptidrecvc - /* actually: struct ips_tid_recv_desc *tidrecvc */, - struct ips_message_header *p_hdr) -{ - struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) vptidrecvc; - /* Update stats for generation errors */ - tidrecvc->stats.nGenErr++; - - /* If packet faced congestion we may want to generate - * a CN packet to rate control sender. - */ - - return; -} -#endif // PSM_OPA diff --git a/psm3/ptl_ips/ips_proto_header.h b/psm3/ptl_ips/ips_proto_header.h index 0d0a5bf..aa0e84c 100644 --- a/psm3/ptl_ips/ips_proto_header.h +++ b/psm3/ptl_ips/ips_proto_header.h @@ -146,17 +146,6 @@ struct ips_message_header { ptl_arg_t hdr_data; } PACK_SUFFIX; -#ifdef PSM_OPA - /* for expected tid packet only */ - struct { - __u8 exp_ustart[3]; /* unaligned start bytes */ - __u8 exp_uend[3]; /* unaligned end bytes */ - __u16 exp_rdescid_genc; /* tidrecvc gen count */ - ptl_arg_t exp_sdescid; /* sender descriptor id */ - __u32 exp_cksum; /* optional checksum */ - __u32 exp_offset; /* packet offset */ - } PACK_SUFFIX; -#endif }; } PACK_SUFFIX; /* desc_genc is up to 32 bits, but EXPTID header (and RDMA immediate data) @@ -177,13 +166,8 @@ struct ips_message_header { #define OPCODE_LONG_RTS 0xC4 /* ready to send */ #define OPCODE_LONG_CTS 0xC5 /* confirm to send */ #define OPCODE_LONG_DATA 0xC6 /* long data packets */ -#ifdef PSM_OPA -#define OPCODE_EXPTID 0xC7 /* expected tid data */ -#define OPCODE_EXPTID_COMPLETION 0xC8 /* expected tid completion */ -#else #define OPCODE_ERR_CHK_RDMA 0xC7 /* RDMA error recovery */ #define OPCODE_ERR_CHK_RDMA_RESP 0xC8 /* RDMA error recovery response */ -#endif /* ACK to ERR_CHK_GEN are "level 0 control packets" state machine driven send */ /* reissue if given state persists */ /* duplicates can occur with no consequences */ @@ -191,11 +175,7 @@ struct ips_message_header { #define OPCODE_NAK 0xCA /* explicit NAK packet */ #define OPCODE_BECN 0xCB /* congestion control */ #define OPCODE_ERR_CHK 0xCC /* query eager receiving */ -#ifdef PSM_OPA -#define OPCODE_ERR_CHK_GEN 0xCD /* query tid receiving */ -#else // 0xCD /* reserved */ -#endif /* CONNECT_REQUEST to DISCONNECT_REPLY are "level 1 control packets" */ /* timer based resend, but 
rebuild on fly when resend */ /* consumer must deal with duplicates */ diff --git a/psm3/ptl_ips/ips_proto_help.h b/psm3/ptl_ips/ips_proto_help.h index 3beb9ee..4a3a680 100644 --- a/psm3/ptl_ips/ips_proto_help.h +++ b/psm3/ptl_ips/ips_proto_help.h @@ -147,15 +147,6 @@ ips_do_cksum(struct ips_proto *proto, struct ips_message_header *p_hdr, return 0; } -#ifdef PSM_OPA -PSMI_ALWAYS_INLINE( -uint32_t -ips_proto_dest_context_from_header(struct ips_proto *proto, - struct ips_message_header *p_hdr)) -{ - return (__be32_to_cpu(p_hdr->bth[1]) & 0xFF); -} -#endif PSMI_ALWAYS_INLINE( void @@ -188,13 +179,6 @@ ips_proto_hdr(struct ips_proto *proto, struct ips_epaddr *ipsaddr, (scb-> offset_mode << HFI_KHDR_OM_SHIFT) -#ifdef PSM_OPA - | (scb-> tid << - HFI_KHDR_TID_SHIFT) - | (scb-> - tidctrl << - HFI_KHDR_TIDCTRL_SHIFT) | -#endif (scb-> flags & IPS_SEND_FLAG_INTR) | (scb-> @@ -222,10 +206,6 @@ ips_proto_hdr(struct ips_proto *proto, struct ips_epaddr *ipsaddr, p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH | ((flow->path->pr_sl & HFI_LRH_SL_MASK) << HFI_LRH_SL_SHIFT) -#ifdef PSM_OPA - | ((proto->sl2sc[flow->path->pr_sl] & - HFI_LRH_SC_MASK) << HFI_LRH_SC_SHIFT) -#endif ); p_hdr->lrh[1] = dlid; p_hdr->lrh[2] = lrh2_be; @@ -237,44 +217,9 @@ ips_proto_hdr(struct ips_proto *proto, struct ips_epaddr *ipsaddr, p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn_num | (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); -#ifdef PSM_OPA - if (scb->tidctrl) { /* expected receive packet */ - psmi_assert(scb->tidsendc != NULL); - p_hdr->bth[1] = __cpu_to_be32(ipsaddr->opa.context | - (ipsaddr->opa.subcontext << - HFI_BTH_SUBCTXT_SHIFT) | - (scb->tidsendc-> - rdescid._desc_idx - << HFI_BTH_FLOWID_SHIFT) - | (proto->epinfo. - ep_baseqp << - HFI_BTH_QP_SHIFT)); - - /* Setup KHDR fields */ - p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 | - (scb->tidctrl << - HFI_KHDR_TIDCTRL_SHIFT) | - (scb->scb_flags & - IPS_SEND_FLAG_INTR) - | (scb->scb_flags & - IPS_SEND_FLAG_HDRSUPP) - | (IPS_PROTO_VERSION << - HFI_KHDR_KVER_SHIFT)); - } else { /* eager receive packet */ - p_hdr->bth[1] = __cpu_to_be32(ipsaddr->opa.context | - (ipsaddr-> - opa.subcontext << - HFI_BTH_SUBCTXT_SHIFT) | - (flow->flowid - << HFI_BTH_FLOWID_SHIFT) - | (proto->epinfo. 
- ep_baseqp << - HFI_BTH_QP_SHIFT)); -#else { p_hdr->bth[1] = __cpu_to_be32((flow->flowid << HFI_BTH_FLOWID_SHIFT)); -#endif // PSM_OPA /* Setup KHDR fields */ p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 | (scb->scb_flags & @@ -285,11 +230,7 @@ ips_proto_hdr(struct ips_proto *proto, struct ips_epaddr *ipsaddr, p_hdr->ack_seq_num = flow->recv_seq_num.psn_num; } -#ifndef PSM_OPA p_hdr->khdr.job_key = 0; -#else - p_hdr->khdr.job_key = __cpu_to_le32(proto->epinfo.ep_jkey); -#endif p_hdr->connidx = ipsaddr->connidx_outgoing; p_hdr->flags = flags; @@ -309,14 +250,10 @@ void ips_scb_prepare_flow_inner(struct ips_proto *proto, struct ips_epaddr *ipsaddr, struct ips_flow *flow, ips_scb_t *scb)) { -#ifdef PSM_OPA - psmi_assert((scb->payload_size & 3) == 0); -#else // On UD and UDP, ips_ptl_mq_rndv can allow small odd sized payload // in RTS and eager can do odd length send psmi_assert(psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_PKT_SIZE) || ((scb->payload_size & 3) == 0)); -#endif ips_proto_hdr(proto, ipsaddr, flow, scb, ips_flow_gen_ackflags(scb, flow)); @@ -346,9 +283,6 @@ ips_proto_epaddr_stats_set(struct ips_proto *proto, uint8_t msgtype)) case OPCODE_ACK: break; case OPCODE_ERR_CHK: -#ifdef PSM_OPA - case OPCODE_ERR_CHK_GEN: -#endif proto->epaddr_stats.err_chk_send++; break; case OPCODE_NAK: @@ -446,25 +380,8 @@ ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev)) struct ips_flow *flow; psmi_seqnum_t sequence_num; -#ifdef PSM_OPA - psmi_assert((flowid == EP_FLOW_GO_BACK_N_PIO) || - (flowid == EP_FLOW_GO_BACK_N_DMA) - ); -#else psmi_assert(flowid == EP_FLOW_GO_BACK_N_PIO); -#endif flow = &ipsaddr->flows[flowid]; -#ifdef PSM_OPA - /* If packet faced congestion generate BECN in NAK. */ - if_pf((rcv_ev->is_congested & IPS_RECV_EVENT_FECN) && - ((flow->cca_ooo_pkts & 0xf) == 0)) { - /* Generate a BECN for every 16th OOO packet marked with a FECN. */ - flow->flags |= IPS_FLOW_FLAG_GEN_BECN; - flow->cca_ooo_pkts++; - rcv_ev->proto->epaddr_stats.congestion_pkts++; - rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; /* Clear FECN event */ - } -#endif sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); if_pf(flow->recv_seq_num.psn_num == sequence_num.psn_num) { @@ -472,9 +389,6 @@ ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev)) flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num + 1) & proto->psn_mask; -#ifdef PSM_OPA - flow->cca_ooo_pkts = 0; -#endif /* don't process ack, caller will do it. */ return 1; @@ -489,36 +403,6 @@ ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev)) ips_proto_send_nak((struct ips_recvhdrq *) rcv_ev->recvq, flow); flow->flags |= IPS_FLOW_FLAG_NAK_SEND; -#ifdef PSM_OPA - flow->cca_ooo_pkts = 0; - } else if (proto->flags & IPS_PROTO_FLAG_CCA) { - flow->cca_ooo_pkts = diff; - // for OPA, ack_interval_bytes >= ack_interval*mtu - // so only need to check ack_interval here - if (flow->cca_ooo_pkts > flow->ack_interval) { - ips_scb_t ctrlscb; - - rcv_ev->proto->epaddr_stats.congestion_pkts++; - flow->flags |= IPS_FLOW_FLAG_GEN_BECN; - _HFI_CCADBG - ("BECN Generation. Expected: %d, Got: %d.\n", - flow->recv_seq_num.psn_num, - sequence_num.psn_num); - - ctrlscb.scb_flags = 0; - ctrlscb.ips_lrh.data[0].u32w0 = - flow->cca_ooo_pkts; - /* Send Control message to throttle flow. Will clear flow flag and - * reset cca_ooo_pkts. 
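ips_proto_is_expected_or_nak() keeps its go-back-N core after this change: a packet is processed only if its PSN equals the next expected value (advanced modulo the PSN mask), and the first out-of-order packet in a gap triggers a single NAK; the deleted branch additionally generated BECNs for OPA congestion control. A reduced sketch of the surviving check, with an assumed 24-bit mask and hypothetical names:

    #include <stdbool.h>
    #include <stdint.h>

    #define PSN_MASK 0xffffffu   /* assumed 24-bit packet sequence number */

    struct rx_flow {
        uint32_t recv_psn;       /* next expected PSN          */
        bool     nak_pending;    /* NAK already sent for a gap */
    };

    static bool is_expected(struct rx_flow *flow, uint32_t pkt_psn,
                            void (*send_nak)(struct rx_flow *))
    {
        if (flow->recv_psn == pkt_psn) {
            flow->recv_psn = (flow->recv_psn + 1) & PSN_MASK;
            flow->nak_pending = false;
            return true;             /* in order: caller handles payload */
        }
        if (!flow->nak_pending) {    /* out of order: request retransmit */
            send_nak(flow);
            flow->nak_pending = true;
        }
        return false;
    }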
- */ - // no payload, pass cksum so non-NULL - psm3_ips_proto_send_ctrl_message(flow, - OPCODE_BECN, - &flow->ipsaddr-> - ctrl_msg_queued, - &ctrlscb, ctrlscb.cksum, 0); - } -#endif // PSM_OPA } } @@ -586,24 +470,6 @@ ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev, uint32_t index; #ifdef PSM_FI -#ifdef PSM_OPA - /* NOTE: Fault injection will currently not work with hardware - * suppression. See note below for reason why as we currently - * do not update the hardware tidflow table if FI is dropping - * the packet. - * - * We need to look into the packet before dropping it and - * if it's an expected packet AND we have hardware suppression - * then we need to update the hardware tidflow table and the - * associated tidrecvc state to fake having received a packet - * until some point in the window defined by the loss rate. - * This way the subsequent err chk will be NAKd and we can resync - * the flow with the sender. - * - * Note: For real errors the hardware generates seq/gen errors - * which are handled appropriately by the protocol. - */ -#endif if_pf(PSM3_FAULTINJ_ENABLED_EP(rcv_ev->proto->ep)) { PSM3_FAULTINJ_STATIC_DECL(fi_recv, "recvlost", diff --git a/psm3/ptl_ips/ips_proto_internal.h b/psm3/ptl_ips/ips_proto_internal.h index 7146a89..e4bb08b 100644 --- a/psm3/ptl_ips/ips_proto_internal.h +++ b/psm3/ptl_ips/ips_proto_internal.h @@ -94,12 +94,6 @@ psm2_error_t psm3_ips_proto_timer_ack_callback(struct psmi_timer *, uint64_t); psm2_error_t psm3_ips_proto_timer_send_callback(struct psmi_timer *, uint64_t); psm2_error_t psm3_ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t); psm2_error_t psm3_ips_proto_timer_pendq_callback(struct psmi_timer *, uint64_t); -#ifdef PSM_OPA -psm2_error_t ips_cca_timer_callback(struct psmi_timer *current_timer, - uint64_t current); - -psm2_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment); -#endif void psm3_ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context); psm2_error_t psm3_ips_proto_recv_init(struct ips_proto *proto); diff --git a/psm3/ptl_ips/ips_proto_mq.c b/psm3/ptl_ips/ips_proto_mq.c index a7a423a..f911663 100644 --- a/psm3/ptl_ips/ips_proto_mq.c +++ b/psm3/ptl_ips/ips_proto_mq.c @@ -307,36 +307,15 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, uint16_t msgseq; ips_scb_t *scb; uint16_t padding = 0; // padding for 1st in sequence -#if defined(PSM_OPA) -#ifdef PSM_DEBUG - uint32_t is_non_dw_mul_allowed = 0; // only for debug asserts -#endif -#endif uint32_t frag_size = flow->frag_size; psmi_assert(len > 0); psmi_assert(req != NULL); -#ifdef PSM_OPA - if (flow->transfer == PSM_TRANSFER_DMA) { - psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0); - /* max chunk size is the rv window size */ - chunk_size = ipsaddr->opa.window_rv; -#ifdef PSM_DEBUG - if (psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE)) - is_non_dw_mul_allowed = 1; -#endif - } else { - psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0); - padding = len & 0x3; // will pad 1st in sequence - chunk_size = frag_size; - } -#else if (! 
psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_PKT_SIZE)) padding = len & 0x3; // will pad 1st in sequence chunk_size = min(proto->ep->chunk_max_segs*frag_size, proto->ep->chunk_max_size); -#endif msgseq = ipsaddr->msgctl->mq_send_seqnum++; nbytes_left = len; @@ -356,9 +335,6 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, pktlen = frag_size - padding; } else { pktlen = min(chunk_size, nbytes_left); -#ifdef PSM_OPA - psmi_assert(!(pktlen & 0x3) || is_non_dw_mul_allowed); -#endif } scb = mq_alloc_pkts(proto, 1, 0, 0); @@ -383,16 +359,6 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (req->is_buf_gpu_mem) { -#ifdef PSM_OPA - /* PSM would never send packets using eager protocol - * if GPU Direct RDMA is turned off, which makes setting - * these flags safe. - */ - // assume SDMA which will pin as part of SDMA request - // if not, flags will get handled in pio transfer_frame - // but use cuMemcpy instead of GDRCopy - ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; -#else // flags will get handled in pio transfer_frame // but use cuMemcpy instead of GDRCopy #ifdef PSM_HAVE_REG_MR @@ -401,7 +367,6 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, #else ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; #endif -#endif // PSM_OPA // TBD USER_BUF_GPU only useful for RTS ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; } @@ -412,9 +377,6 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, nbytes_left -= pktlen; pktlen += padding; -#ifdef PSM_OPA - psmi_assert(!(pktlen & 0x3) || is_non_dw_mul_allowed); -#endif padding = 0; // rest of packets don't need padding scb->frag_size = frag_size; @@ -450,14 +412,6 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, } while (nbytes_left); -#ifdef PSM_OPA - /* after all sdma setup, flush sdma queue, - * we want one system call to handle as many scbs as possible. - */ - if (flow->transfer == PSM_TRANSFER_DMA) { - err = flow->flush(flow, NULL); - } -#endif /* Before return, try to make some progress as long as the operation is * not a fast path isend. 
If this is a fast path isend we cannot call @@ -513,11 +467,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) !req->is_buf_gpu_mem && #endif -#ifdef PSM_OPA - !(len & 0x3)) { // must be well aligned -#else (psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_PKT_SIZE) || !(len & 0x3))) { -#endif ips_scb_buffer(scb) = (void *)buf; scb->chunk_size = ips_scb_length(scb) = len; req->send_msgoff = len; @@ -560,11 +510,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, chb = NULL; window_len = ips_cuda_next_window( -#ifdef PSM_OPA - ipsaddr->opa.window_rv, -#else proto->mq->hfi_base_window_rv, -#endif offset, len); unsigned bufsz; @@ -681,11 +627,6 @@ static inline int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len) { if ( -#ifdef PSM_OPA - !(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) || - !PSMI_IS_GDR_COPY_ENABLED || - len < 1 || -#endif len > cuda_thresh_rndv){ return 1; } @@ -694,46 +635,6 @@ int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len) } #endif //PSM_CUDA || PSM_ONEAPI -#ifdef PSM_OPA -/* Find the correct flow (PIO/DMA) */ -static inline -ips_epaddr_flow_t -flow_select_type(struct ips_proto *proto, uint32_t len, int gpu_mem, - uint32_t eager_thresh) -{ - ips_epaddr_flow_t flow_type; - uint32_t pio_gdr_threshold; - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (gpu_mem) { - pio_gdr_threshold = gdr_copy_limit_send; - } else -#endif - { - pio_gdr_threshold = eager_thresh; - } - - if (len <= pio_gdr_threshold) { /* PIO or GDRcopy */ - flow_type = EP_FLOW_GO_BACK_N_PIO; - /* - * If PIO was disabled through the environment variable, - * override the flow value. - */ - if (unlikely(ips_proto_is_disabled_pio(proto))) - flow_type = EP_FLOW_GO_BACK_N_DMA; - } else { /* Send DMA */ - flow_type = EP_FLOW_GO_BACK_N_DMA; - /* - * If Send DMA was disabled through the environment variable, - * override the flow value. 
- */ - if (unlikely(ips_proto_is_disabled_sdma(proto))) - flow_type = EP_FLOW_GO_BACK_N_PIO; - } - - return flow_type; -} -#endif // PSM_OPA psm2_error_t psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user, @@ -741,9 +642,6 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user uint32_t len, void *context, psm2_mq_req_t *req_o) { psm2_error_t err = PSM2_OK; -#ifdef PSM_OPA - ips_epaddr_flow_t flow_type; -#endif struct ips_proto *proto; struct ips_flow *flow; ips_epaddr_t *ipsaddr; @@ -786,18 +684,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user goto do_rendezvous; } #endif -#ifdef PSM_OPA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - flow_type = flow_select_type(proto, len, req->is_buf_gpu_mem, - proto->iovec_thresh_eager); -#else - flow_type = flow_select_type(proto, len, 0, - proto->iovec_thresh_eager); -#endif - flow = &ipsaddr->flows[flow_type]; -#else flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; -#endif if (flags_user & PSM2_MQ_FLAG_SENDSYNC) { goto do_rendezvous; @@ -891,12 +778,10 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user if (len <= gdr_copy_limit_send && NULL != (user_buffer = psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)ubuf, len , 0, proto->ep))) { -#ifndef PSM_OPA /* init req so ips_proto_mq_eager_complete can unmap */ req->req_data.buf = (uint8_t*)ubuf; req->req_data.buf_len = len; req->req_data.send_msglen = len; -#endif proto->strat_stats.short_gdrcopy_isend++; proto->strat_stats.short_gdrcopy_isend_bytes += len; } else { @@ -1113,17 +998,12 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len) { psm2_error_t err = PSM2_OK; -#ifdef PSM_OPA - ips_epaddr_flow_t flow_type; -#endif struct ips_proto *proto; struct ips_flow *flow; ips_epaddr_t *ipsaddr; ips_scb_t *scb; -#if defined(PSM_OPA) - int gpu_mem = 0; -#elif defined(PSM_CUDA) || defined (PSM_ONEAPI) +#if defined(PSM_CUDA) || defined (PSM_ONEAPI) int gpu_mem = 0; #endif @@ -1149,13 +1029,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, goto do_rendezvous; } #endif -#ifdef PSM_OPA - flow_type = flow_select_type(proto, len, gpu_mem, - proto->iovec_thresh_eager_blocking); - flow = &ipsaddr->flows[flow_type]; -#else flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; -#endif if (flags & PSM2_MQ_FLAG_SENDSYNC) { goto do_rendezvous; @@ -1319,9 +1193,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, * same user_buffer from two IOs here. 
*/ if (ips_scb_buffer(scb) == (void *)user_buffer) { -#ifdef PSM_OPA - if (flow->transfer != PSM_TRANSFER_PIO || -#elif defined(PSM_HAVE_REG_MR) +#if defined(PSM_HAVE_REG_MR) if ((ips_scb_flags(scb) & IPS_SEND_FLAG_SEND_MR) || #else if ( @@ -1708,9 +1580,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) uint32_t nbytes_this, chunk_size; uint32_t frag_size, unaligned_bytes; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) -#ifndef PSM_OPA int converted = 0; -#endif #endif struct ips_flow *flow; ips_scb_t *scb; @@ -1719,43 +1589,12 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) psmi_assert(nbytes_left > 0); PSM2_LOG_MSG("entering."); -#ifdef PSM_OPA - if ( -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - (req->is_buf_gpu_mem && (proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND)) || -#endif - req->req_data.send_msglen > proto->iovec_thresh_eager) { - /* use SDMA transfer */ - psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0); - flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA]; - frag_size = flow->frag_size; - /* max chunk size is the rv window size */ - chunk_size = ipsaddr->opa.window_rv; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (req->is_buf_gpu_mem) { - proto->strat_stats.rndv_long_gdr_send += dostats; - proto->strat_stats.rndv_long_gdr_send_bytes += dostats*req->req_data.send_msglen; - } else { -#endif - proto->strat_stats.rndv_long_dma_cpu_send += dostats; - proto->strat_stats.rndv_long_dma_cpu_send_bytes += dostats*req->req_data.send_msglen; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - } -#endif - } else { - psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0); -#else /* PSM_OPA */ { -#endif /* PSM_OPA */ /* use PIO transfer */ flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; -#ifdef PSM_OPA - chunk_size = frag_size = flow->frag_size; -#else frag_size = flow->frag_size; chunk_size = min(proto->ep->chunk_max_segs*frag_size, proto->ep->chunk_max_size); -#endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR @@ -1775,7 +1614,6 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) proto->strat_stats.rndv_long_gdr_send_bytes += dostats*req->req_data.send_msglen; } else #endif -#ifndef PSM_OPA #if defined(PSM_CUDA) || defined(PSM_ONEAPI) // for GPU send buffer <= 3, receiver can select // LONG DATA and we can use GDRCopy @@ -1789,9 +1627,6 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) proto->strat_stats.rndv_long_gdrcopy_send_bytes += dostats*req->req_data.send_msglen; } else { buf = (uintptr_t) req->req_data.buf + req->recv_msgoff; -#else - { -#endif #else { #endif @@ -1855,9 +1690,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) if (unaligned_bytes) { #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (!req->is_buf_gpu_mem -#ifndef PSM_OPA || converted -#endif ) mq_copy_tiny_host_mem((uint32_t *)&scb->ips_lrh.mdata, (uint32_t *)buf, unaligned_bytes); @@ -1888,13 +1721,10 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) // SDMA identifies GPU buffers itself. But PIO path needs flags if (req->is_buf_gpu_mem -#ifdef PSM_OPA - && ! (proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) -#endif ) { #ifdef PSM_HAVE_REG_MR if (!req->mr && !converted) -#elif ! 
defined(PSM_OPA) +#else if (!converted) #endif ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; @@ -1945,12 +1775,6 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) } while (nbytes_left); -#ifdef PSM_OPA - /* for sdma, if some bytes are queued, flush them */ - if (flow->transfer == PSM_TRANSFER_DMA && nbytes_sent) { - flow->flush(flow, NULL); - } -#endif PSM2_LOG_MSG("leaving."); @@ -1995,12 +1819,10 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) p_hdr->data[1].u32w0); proto->epaddr_stats.cts_rdma_recv++; -#ifndef PSM_OPA /* OPA may use TID for small messages when GDRCopy disabled */ #if defined(PSM_CUDA) || defined(PSM_ONEAPI) psmi_assert(p_hdr->data[1].u32w1 > min(cuda_thresh_rndv, mq->hfi_thresh_rv)); // msglen #else psmi_assert(p_hdr->data[1].u32w1 > mq->hfi_thresh_rv); // msglen -#endif #endif psmi_assert(proto->protoexp != NULL); @@ -2602,9 +2424,7 @@ psm3_ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) struct ips_flow *flow; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) -#ifndef PSM_OPA int use_gdrcopy = 0; -#endif struct ips_proto *proto = rcv_ev->proto; #endif // PSM_CUDA || PSM_ONEAPI psmi_copy_tiny_fn_t psmi_copy_tiny_fn = mq_copy_tiny; @@ -2629,7 +2449,6 @@ psm3_ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) // cpu stats already tracked when sent CTS if (req->is_buf_gpu_mem) { -#ifndef PSM_OPA req->req_data.buf = req->user_gpu_buffer; // 1st packet with any unaligned data we handle here if (p_hdr->data[1].u32w0 < 4) { @@ -2652,9 +2471,6 @@ psm3_ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) //proto->strat_stats.rndv_long_gdr_recv++; proto->strat_stats.rndv_long_gdr_recv_bytes += paylen; } else { -#else - { -#endif // PSM_OPA if (p_hdr->data[1].u32w0 < 4) proto->strat_stats.rndv_long_cuCopy_recv++; proto->strat_stats.rndv_long_cuCopy_recv_bytes += paylen; } @@ -2675,11 +2491,7 @@ psm3_ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) psm3_mq_handle_data(mq, req, p_hdr->data[1].u32w0, payload, paylen #if defined(PSM_CUDA) || defined(PSM_ONEAPI) -#ifdef PSM_OPA - , 0, NULL); -#else , use_gdrcopy, rcv_ev->proto->ep); -#endif #else ); #endif diff --git a/psm3/ptl_ips/ips_proto_params.h b/psm3/ptl_ips/ips_proto_params.h index 9327a75..3114880 100644 --- a/psm3/ptl_ips/ips_proto_params.h +++ b/psm3/ptl_ips/ips_proto_params.h @@ -125,9 +125,6 @@ #define IPS_FLOW_FLAG_PENDING_ACK 0x02 #define IPS_FLOW_FLAG_PENDING_NAK 0x04 #define IPS_FLOW_FLAG_GEN_BECN 0x08 -#ifdef PSM_OPA -#define IPS_FLOW_FLAG_CONGESTED 0x10 -#endif #define IPS_FLOW_FLAG_SKIP_CTS 0x20 /* tid session expected send flags */ @@ -183,18 +180,11 @@ /* 0x10000000, interrupt when done */ #define IPS_SEND_FLAG_INTR (1<protoexp; - ptl_arg_t desc_id = p_hdr->data[0]; - struct ips_tid_send_desc *tidsendc; - ptl_arg_t desc_tidsendc; - struct ips_flow *flow; - uint32_t last_seq; - struct ips_scb_unackedq *unackedq; - - tidsendc = (struct ips_tid_send_desc *) - psm3_mpool_find_obj_by_index(protoexp->tid_desc_send_pool, - desc_id._desc_idx); - if (tidsendc == NULL) { - _HFI_ERROR - ("OPCODE_ACK: Index %d is out of range in tidflow ack\n", - desc_id._desc_idx); - return NULL; - } - - /* Ensure generation matches */ - psm3_mpool_get_obj_index_gen_count(tidsendc, - &desc_tidsendc._desc_idx, - &desc_tidsendc._desc_genc); - if (desc_tidsendc.u64 != desc_id.u64) - return NULL; - - /* Ensure ack is within window */ - flow = &tidsendc->tidflow; - unackedq = &flow->scb_unacked; - - /* No unacked 
scbs */ - if (STAILQ_EMPTY(unackedq)) - return NULL; - - /* Generation for ack should match */ - if (STAILQ_FIRST(unackedq)->seq_num.psn_gen != ack_seq_num.psn_gen) - return NULL; - - /* scb_pend will be moved back when an nak is received, but - * the packet may actually be received and acked after the nak, - * so we use the tail of unacked queue, which may include packets - * not being sent out yet, this is over do, but it should be OK. */ - last_seq = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_seq; - - if (between(flow->xmit_ack_num.psn_seq, - last_seq, ack_seq_num.psn_seq) == 0) - return NULL; - - return flow; -} -#endif // PSM_OPA - -#ifdef PSM_OPA -/* NAK post process for tid flow */ -void ips_tidflow_nak_post_process(struct ips_proto *proto, - struct ips_flow *flow) -{ - ips_scb_t *scb; - uint32_t first_seq, ack_seq; - - scb = STAILQ_FIRST(&flow->scb_unacked); - first_seq = __be32_to_cpu(scb->ips_lrh.bth[2]) & HFI_BTH_SEQ_MASK; - ack_seq = (flow->xmit_ack_num.psn_seq - 1) & HFI_BTH_SEQ_MASK; - - /* If the ack SEQ falls into a multi-packets scb, - * don't re-send the packets already acked. */ - if (scb->nfrag > 1 && - between(first_seq, scb->seq_num.psn_seq, ack_seq)) { - uint32_t om, offset_in_tid, remaining_bytes_in_tid; - uint32_t npkt, pktlen, nbytes; - uint32_t idx, loop; - - /* how many packets acked in this scb */ - npkt = ((ack_seq - first_seq) & HFI_BTH_SEQ_MASK) + 1; - - /* Get offset/om from current packet header */ - offset_in_tid = __le32_to_cpu(scb->ips_lrh.khdr.kdeth0) & - HFI_KHDR_OFFSET_MASK; - om = (__le32_to_cpu(scb->ips_lrh.khdr.kdeth0) >> - HFI_KHDR_OM_SHIFT) & 0x1; - if (om) - offset_in_tid *= 64; - else - offset_in_tid *= 4; - /* bytes remaining in current tid */ - remaining_bytes_in_tid = - (IPS_TIDINFO_GET_LENGTH(scb->tsess[0]) << 12) - - offset_in_tid; - - /* packet length in current header */ - pktlen = scb->payload_size; - psmi_assert(min(remaining_bytes_in_tid, - scb->frag_size) >= pktlen); - psmi_assert((ips_proto_lrh2_be_to_bytes(proto, - scb->ips_lrh.lrh[2]) - - sizeof(struct ips_message_header) - - HFI_CRC_SIZE_IN_BYTES) == pktlen); - - /* Loop to find the position to start */ - idx = 0; - nbytes = 0; - loop = npkt; - while (loop) { - remaining_bytes_in_tid -= pktlen; - offset_in_tid += pktlen; - nbytes += pktlen; - first_seq++; - loop--; - - if (remaining_bytes_in_tid == 0) { - idx++; - remaining_bytes_in_tid = - IPS_TIDINFO_GET_LENGTH(scb-> - tsess[idx]) << 12; - offset_in_tid = 0; - } - - pktlen = min(remaining_bytes_in_tid, scb->frag_size); - } - psmi_assert((first_seq & HFI_BTH_SEQ_MASK) == - ((ack_seq + 1) & HFI_BTH_SEQ_MASK)); - - /* 0. update scb info */ - psmi_assert(scb->nfrag_remaining > npkt); - scb->nfrag_remaining -= npkt; - psmi_assert(scb->chunk_size_remaining > nbytes); - scb->chunk_size_remaining -= nbytes; - ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes); - - /* 1. if last packet in sequence, set ACK, clear SH */ - if (scb->nfrag_remaining == 1) { - psmi_assert(scb->chunk_size_remaining <= - scb->frag_size); - scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; - scb->scb_flags &= ~IPS_SEND_FLAG_HDRSUPP; - - /* last packet is what remaining */ - pktlen = scb->chunk_size_remaining; - } - - /* 2. set new packet sequence number */ - scb->ips_lrh.bth[2] = __cpu_to_be32( - ((first_seq & HFI_BTH_SEQ_MASK) << HFI_BTH_SEQ_SHIFT) | - ((scb->seq_num.psn_gen & - HFI_BTH_GEN_MASK) << HFI_BTH_GEN_SHIFT) | - (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); - /* 3. set new packet offset */ - scb->ips_lrh.exp_offset += nbytes; - - /* 4. 
if packet length is changed, set new length */ - if (scb->payload_size != pktlen) { - scb->payload_size = pktlen; - scb->ips_lrh.lrh[2] = __cpu_to_be16(( - (scb->payload_size + - sizeof(struct ips_message_header) + - HFI_CRC_SIZE_IN_BYTES) >> - BYTE2DWORD_SHIFT) & proto->pktlen_mask); - } - - /* 5. set new tidctrl and tidinfo array */ - scb->tsess = &scb->tsess[idx]; - scb->tsess_length -= idx * sizeof(uint32_t); - scb->tidctrl = IPS_TIDINFO_GET_TIDCTRL(scb->tsess[0]); - - /* 6. calculate new offset mode */ - if (offset_in_tid < 131072) { /* 2^15 * 4 */ - offset_in_tid /= 4; - om = 0; - } else { - offset_in_tid /= 64; - om = 1; - } - - /* 7. set new tidinfo */ - scb->ips_lrh.khdr.kdeth0 = __cpu_to_le32( - (offset_in_tid & HFI_KHDR_OFFSET_MASK) | - (om << HFI_KHDR_OM_SHIFT) | - (IPS_TIDINFO_GET_TID(scb->tsess[0]) - << HFI_KHDR_TID_SHIFT) | - (scb->tidctrl << HFI_KHDR_TIDCTRL_SHIFT) | - (scb->scb_flags & IPS_SEND_FLAG_INTR) | - (scb->scb_flags & IPS_SEND_FLAG_HDRSUPP) | - (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT)); - } - - /* Update unacked scb's to use the new generation */ - while (scb) { - /* update with new generation */ - scb->ips_lrh.bth[2] = __cpu_to_be32( - (__be32_to_cpu(scb->ips_lrh.bth[2]) & - (~(HFI_BTH_GEN_MASK << HFI_BTH_GEN_SHIFT))) | - ((flow->xmit_seq_num.psn_gen & - HFI_BTH_GEN_MASK) << HFI_BTH_GEN_SHIFT)); - scb->seq_num.psn_gen = flow->xmit_seq_num.psn_gen; - scb = SLIST_NEXT(scb, next); - } -} -#endif // PSM_OPA /* NAK post process for any flow where an scb may describe more than 1 packet * (OPA dma flow or GSO PIO flow). In which case we may need to resume in @@ -516,19 +313,11 @@ psm3_ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) psmi_seqnum_t ack_seq_num, last_seq_num; ips_epaddr_flow_t flowid; ips_scb_t *scb; -#ifdef PSM_OPA - uint32_t tidctrl; -#endif ack_seq_num.psn_num = p_hdr->ack_seq_num; // check actual psn acked (ack_seq_num-1), we only want to process acks // for packets we never got an ack for -#ifdef PSM_OPA - tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)); - if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) { -#else if ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW) { -#endif ack_seq_num.psn_num = (ack_seq_num.psn_num - 1) & proto->psn_mask; psmi_assert(flowid < EP_FLOW_LAST); @@ -536,23 +325,14 @@ psm3_ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) if (!pio_dma_ack_valid(proto, flow, ack_seq_num)) goto ret; } else { -#ifndef PSM_OPA // we don't put TID (aka RDMA) pkts on UD, shouldn't get ACKs about it _HFI_ERROR("Got ack for invalid flowid\n"); goto ret; -#else - ack_seq_num.psn_seq -= 1; - flow = get_tidflow(proto, ipsaddr, p_hdr, ack_seq_num); - if (!flow) /* Invalid ack for flow */ - goto ret; -#endif } -#ifndef PSM_OPA #ifndef PSM_TCP_ACK // for ack-less TCP we should have acked self-packet before recv reports // the given ack_seq_num psmi_assert(psm3_epid_protocol(proto->ep->epid) != PSMI_ETH_PROTO_TCP); -#endif #endif flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num; @@ -637,9 +417,6 @@ psm3_ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) #else _HFI_VDBG("after all ACKed: flow_credits %d\n", flow->credits); -#endif -#ifdef PSM_OPA - flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; #endif goto ret; } else if (flow->timer_ack == scb->timer_ack) { @@ -669,23 +446,7 @@ psm3_ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ -#ifdef PSM_OPA - /* CCA: If flow is congested adjust rate */ - if_pf(rcv_ev->is_congested 
& IPS_RECV_EVENT_BECN) { - if ((flow->path->opa.pr_ccti + - proto->cace[flow->path->pr_sl].ccti_increase) <= - proto->ccti_limit) { - ips_cca_adjust_rate(flow->path, - proto->cace[flow->path->pr_sl]. - ccti_increase); - /* Clear congestion event */ - rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; - } - } - else { -#else { -#endif /* Increase congestion window if flow is not congested */ if_pf(flow->cwin < proto->flow_credits) { // this only happens for OPA, so we don't have to @@ -738,9 +499,6 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) psm_protocol_type_t protocol; ips_epaddr_flow_t flowid; ips_scb_t *scb; -#ifdef PSM_OPA - uint32_t tidctrl; -#endif INC_TIME_SPEND(TIME_SPEND_USER3); @@ -748,12 +506,7 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) // we are likely to get a previous ack_seq_num in NAK, in which case // we need to resend unacked packets starting with ack_seq_num. So check // psn of 1st NAK would like us to retransmit (e.g. don't -1 before check) -#ifdef PSM_OPA - tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)); - if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) { -#else if ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW) { -#endif protocol = PSM_PROTOCOL_GO_BACK_N; psmi_assert(flowid < EP_FLOW_LAST); flow = &ipsaddr->flows[flowid]; @@ -763,26 +516,9 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) (ack_seq_num.psn_num - 1) & proto->psn_mask; flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num; } else { -#ifndef PSM_OPA // we don't put TID (aka RDMA) pkts on UD, shouldn't get NAKs about it _HFI_ERROR("Got nak for invalid flowid\n"); goto ret; -#else - protocol = PSM_PROTOCOL_TIDFLOW; - flow = get_tidflow(proto, ipsaddr, p_hdr, ack_seq_num); - if (!flow) - goto ret; /* Invalid ack for flow */ - ack_seq_num.psn_seq--; - - psmi_assert(flow->xmit_seq_num.psn_gen == ack_seq_num.psn_gen); - psmi_assert(flow->xmit_ack_num.psn_gen == ack_seq_num.psn_gen); - /* Update xmit_ack_num with both new generation and new - * acked sequence; update xmit_seq_num with the new flow - * generation, don't change the sequence number. */ - flow->xmit_ack_num = (psmi_seqnum_t) p_hdr->data[1].u32w0; - flow->xmit_seq_num.psn_gen = flow->xmit_ack_num.psn_gen; - psmi_assert(flow->xmit_seq_num.psn_gen != ack_seq_num.psn_gen); -#endif } unackedq = &flow->scb_unacked; @@ -864,9 +600,6 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) #else _HFI_VDBG("after all NAKed: flow_credits %d\n", flow->credits); -#endif -#ifdef PSM_OPA - flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; #endif goto ret; } else if (flow->timer_ack == scb->timer_ack) { @@ -897,14 +630,8 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ if (protocol == PSM_PROTOCOL_TIDFLOW) -#ifndef PSM_OPA // we don't put TID (aka RDMA) pkts on UD, shouldn't get NAKs about it _HFI_ERROR("post processing, Got nak for TID flow, not allowed for UD\n"); -#else - // updates remaining scb's which will be resent - // including new generation - ips_tidflow_nak_post_process(proto, flow); -#endif else if (scb->nfrag > 1) psm3_ips_segmentation_nak_post_process(proto, flow); @@ -936,70 +663,18 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) scb = SLIST_NEXT(scb, next); } -#ifdef PSM_OPA - /* If NAK with congestion bit set - delay re-transmitting and THEN adjust - * CCA rate. 
- */ - if_pf(rcv_ev->is_congested & IPS_RECV_EVENT_BECN) { - uint64_t offset; - - /* Clear congestion event and mark flow as congested */ - rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; - flow->flags |= IPS_FLOW_FLAG_CONGESTED; - - /* For congested flow use slow start i.e. reduce congestion window. - * For TIDFLOW we cannot reduce congestion window as peer expects - * header packets at regular intervals (protoexp->hdr_pkt_interval). - */ - if (flow->protocol != PSM_PROTOCOL_TIDFLOW) - flow->credits = flow->cwin = 1; - else - flow->credits = flow->cwin; - // OPA doesn't need flow_credit_bytes nor ack_internal_bytes - // so no change to flow_credit_bytes nor ack_interval_bytes - - flow->ack_interval = max((flow->credits >> 2) - 1, 1); - - /* During congestion cancel send timer and delay retransmission by - * random interval. Can get away with using just 1st epid word - */ - psmi_timer_cancel(proto->timerq, flow->timer_send); - if (SLIST_FIRST(scb_pend)->ack_timeout != TIMEOUT_INFINITE) - offset = (SLIST_FIRST(scb_pend)->ack_timeout >> 1); - else - offset = 0; - struct drand48_data drand48_data; - srand48_r((long int)(psm3_epid_hash(ipsaddr->epaddr.epid) + psm3_epid_hash(proto->ep->epid)), &drand48_data); - double rnum; - drand48_r(&drand48_data, &rnum); - psmi_timer_request(proto->timerq, flow->timer_send, - (get_cycles() + - (uint64_t) (offset * - (rnum + 1.0)))); - } - else { -#else { -#endif int num_resent = 0; /* Reclaim all credits upto congestion window only */ flow->credits = flow->cwin; flow->ack_interval = max((flow->credits >> 2) - 1, 1); #ifdef PSM_BYTE_FLOW_CREDITS -#ifdef PSM_OPA - // on OPA cwin can decrease when get BECN - // but we know how credit_bytes was initialized - // we never decrease ack_interval_bytes for - // congestion, so no need to increase here - flow->credit_bytes = proto->ep->mtu * flow->credits; -#else // TBD cwin not implemented for UD and UDP so can predict // credit_bytes here psmi_assert(flow->cwin == proto->flow_credits); flow->credit_bytes = proto->flow_credit_bytes; flow->ack_interval_bytes = max((flow->credit_bytes >> 2) - 1, 1); -#endif _HFI_VDBG("after reclaim cwin: flow_credits %d\n", flow->credits); #else /* PSM_BYTE_FLOW_CREDITS */ @@ -1033,10 +708,6 @@ psm3_ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev) psmi_assert(flowid < EP_FLOW_LAST); flow = &ipsaddr->flows[flowid]; recvq->proto->epaddr_stats.err_chk_recv++; -#ifdef PSM_OPA - /* Ignore FECN bit since this is the control path */ - rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; -#endif seq_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); seq_off = (int16_t) (flow->recv_seq_num.psn_num - seq_num.psn_num); diff --git a/psm3/ptl_ips/ips_recvhdrq.h b/psm3/ptl_ips/ips_recvhdrq.h index f7bf2cf..2e4d957 100644 --- a/psm3/ptl_ips/ips_recvhdrq.h +++ b/psm3/ptl_ips/ips_recvhdrq.h @@ -59,9 +59,6 @@ #include "psm_user.h" #include "ips_proto_params.h" #include "ips_proto_header.h" -#ifdef PSM_OPA -#include "hal_gen1/gen1_types.h" /* get psm3_gen1_cl_idx and psm3_gen1_cl_q, psm3_gen1_rhf_t */ -#endif struct ips_recvhdrq; struct ips_recvhdrq_state; @@ -74,11 +71,6 @@ struct ips_epstate; /* keep current packet, revisit the same packet next time */ #define IPS_RECVHDRQ_REVISIT 2 -#ifdef PSM_OPA -/* CCA related receive events */ -#define IPS_RECV_EVENT_FECN 0x1 -#define IPS_RECV_EVENT_BECN 0x2 -#endif struct ips_recvhdrq_event { struct ips_proto *proto; @@ -88,21 +80,10 @@ struct ips_recvhdrq_event { // we point to the payload part of our recv buffer uint8_t *payload; uint32_t payload_size; -#ifdef 
PSM_OPA - psm3_gen1_rhf_t gen1_rhf; - uint8_t has_cksum; /* payload has cksum */ - uint8_t is_congested; /* Packet faced congestion */ - psm3_gen1_cl_q gen1_hdr_q; -#endif }; struct ips_recvhdrq_callbacks { int (*callback_packet_unknown) (const struct ips_recvhdrq_event *); -#ifdef PSM_OPA - int (*callback_subcontext) (struct ips_recvhdrq_event *, - uint32_t subcontext); - int (*callback_error) (struct ips_recvhdrq_event *); -#endif }; /* @@ -115,16 +96,6 @@ struct ips_recvhdrq_callbacks { */ #define NO_EAGER_UPDATE ~0U struct ips_recvhdrq_state { -#ifdef PSM_OPA - psm3_gen1_cl_idx hdrq_head; /* software copy of head */ - psm3_gen1_cl_idx rcv_egr_index_head; /* software copy of eager index head */ - uint32_t head_update_interval; /* Header update interval */ - uint32_t num_hdrq_done; /* Num header queue done */ - uint32_t egrq_update_interval; /* Eager buffer update interval */ - uint32_t num_egrq_done; /* num eager buffer done */ - uint32_t hdr_countdown; /* for false-egr-full tracing */ - uint32_t hdrq_cachedlastscan; /* last element to be prescanned */ -#endif }; /* @@ -132,18 +103,8 @@ struct ips_recvhdrq_state { */ struct ips_recvhdrq { struct ips_proto *proto; -#ifdef PSM_OPA - const psmi_context_t *context; /* error handling, epid id, etc. */ - struct ips_recvhdrq_state *state; - uint32_t subcontext; /* messages that don't match subcontext call - * recv_callback_subcontext */ - psm3_gen1_cl_q gen1_cl_hdrq; -#endif /* Header queue handling */ pthread_spinlock_t hdrq_lock; /* Lock for thread-safe polling */ -#ifdef PSM_OPA - uint32_t hdrq_elemlast; /* last element precomputed */ -#endif /* Lookup endpoints epid -> ptladdr (rank)) */ const struct ips_epstate *epstate; @@ -153,9 +114,6 @@ struct ips_recvhdrq { /* List of flows with pending acks for receive queue */ SLIST_HEAD(pending_flows, ips_flow) pending_acks; -#ifdef PSM_OPA - volatile __u64 *spi_status; -#endif }; PSMI_INLINE( diff --git a/psm3/ptl_ips/ips_scb.c b/psm3/ptl_ips/ips_scb.c index 59342b4..05aead8 100644 --- a/psm3/ptl_ips/ips_scb.c +++ b/psm3/ptl_ips/ips_scb.c @@ -71,9 +71,6 @@ psm3_ips_scbctrl_init(psm2_ep_t ep, size_t scb_size; size_t alloc_sz; uintptr_t base, imm_base; -#ifdef PSM_OPA - /* scbc->context = &ep->context; */ -#endif psm2_error_t err = PSM2_OK; psmi_assert_always(numscb > 0); @@ -276,9 +273,6 @@ ips_scb_t *MOCKABLE(psm3_ips_scbctrl_alloc)(struct ips_scbctrl *scbc, int scbnum scb->tidsendc = NULL; scb->callback = NULL; -#ifdef PSM_OPA - scb->tidctrl = 0; -#endif scb->nfrag = 1; scb->frag_size = 0; scb->chunk_size = 0; @@ -349,9 +343,6 @@ ips_scb_t *MOCKABLE(psm3_ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc) scb->scb_flags = 0; scb->tidsendc = NULL; scb->callback = NULL; -#ifdef PSM_OPA - scb->tidctrl = 0; -#endif scb->nfrag = 1; scb->frag_size = 0; scb->chunk_size = 0; diff --git a/psm3/ptl_ips/ips_scb.h b/psm3/ptl_ips/ips_scb.h index ccaafb7..8195a26 100644 --- a/psm3/ptl_ips/ips_scb.h +++ b/psm3/ptl_ips/ips_scb.h @@ -93,9 +93,6 @@ STAILQ_HEAD(ips_scb_stailq, ips_scb); SLIST_HEAD(ips_scb_slist, ips_scb); struct ips_scbctrl { -#ifdef PSM_OPA - /* const psmi_context_t *context; */ -#endif /* Send control blocks for each send */ uint32_t scb_num; uint32_t scb_num_cur; @@ -170,17 +167,10 @@ struct ips_scb { /* for nfrag>1, initially nfrag_remaining = nfrag */ uint16_t nfrag_remaining; /* remaining packets to transmit */ uint32_t frag_size; /* max packet size in sequence */ -#ifdef PSM_OPA - uint16_t tidctrl; -#endif #ifdef PSM_HAVE_SDMA uint16_t sdma_outstanding; #endif uint16_t opcode; -#ifdef PSM_OPA 
- uint16_t tsess_length; - uint32_t *tsess; -#endif #ifdef PSM_HAVE_REG_MR psm3_verbs_mr_t mr; #endif @@ -197,35 +187,12 @@ struct ips_scb { void *cb_param; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) psm2_mq_req_t mq_req; /* back pointer to original request */ -#endif -#ifdef PSM_OPA - /* sdma header place holder, PSM2 code should access - * the psm_hal_sdma_req_info only using the psm3_get_sdma_req_info() - * accessor function. */ - /* - * The size of struct psm_hal_sdma_req_info is variable. (10 bytes for - * GPU-direct and 8 bytes for non GPU-Direct) - * When GPU-Direct feature is used, all 10 bytes of the space is used. - * Otherwise, we only use upto 8 bytes. The usage is controlled by - * psm3_get_sdma_req_info() in ips_proto.h - */ - struct psm_hal_sdma_req_info _DO_NOT_USE_; #endif struct { -#ifdef PSM_OPA - struct psm_hal_pbc pbc; -#endif struct ips_message_header ips_lrh; } PSMI_CACHEALIGN; }; -#ifdef PSM_OPA -/* Make sure pbc is at the right place before the message header */ - -COMPILE_TIME_ASSERT(PBC_ABUTS_IPS_MSG_HDR,(sizeof(struct psm_hal_pbc) == - (size_t) (offsetof(struct ips_scb, ips_lrh) - - offsetof(struct ips_scb, pbc)))); -#endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) #define IS_TRANSFER_BUF_GPU_MEM(scb) (ips_scb_flags(scb) & IPS_SEND_FLAG_PAYLOAD_BUF_GPU) diff --git a/psm3/ptl_ips/ips_tid.c b/psm3/ptl_ips/ips_tid.c index fd96cf5..e7349dd 100644 --- a/psm3/ptl_ips/ips_tid.c +++ b/psm3/ptl_ips/ips_tid.c @@ -53,229 +53,3 @@ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ -#ifdef PSM_OPA -#include "psm_user.h" -#include "psm2_hal.h" -#include "ips_tid.h" -#include "ips_proto.h" -#include "ips_expected_proto.h" - -psm2_error_t -ips_tid_init(const psmi_context_t *context, struct ips_protoexp *protoexp, - ips_tid_avail_cb_fn_t cb, void *cb_context) -{ - struct ips_tid *tidc = &protoexp->tidc; - - struct psmi_stats_entry entries[] = { - PSMI_STATS_DECL("tid_update_count", MPSPAWN_STATS_REDUCTION_ALL, - NULL, &tidc->tid_num_total), - }; - - tidc->context = context; - tidc->protoexp = protoexp; - tidc->tid_num_total = 0; - tidc->tid_num_inuse = 0; - tidc->tid_avail_cb = cb; - tidc->tid_avail_context = cb_context; - tidc->tid_array = NULL; - - /* - * PSM uses tid registration caching only if driver has enabled it. - */ - if (!psmi_hal_has_cap(PSM_HAL_CAP_TID_UNMAP)) { - int i; - cl_qmap_t *p_map; - cl_map_item_t *root,*nil_item; - - tidc->tid_array = (uint32_t *) - psmi_calloc(context->ep, UNDEFINED, - psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt), - sizeof(uint32_t)); - if (tidc->tid_array == NULL) - return PSM2_NO_MEMORY; - - /* - * first is root node, last is terminator node. - */ - p_map = &tidc->tid_cachemap; - root = (cl_map_item_t *) - psmi_calloc(context->ep, UNDEFINED, - psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt) + 2, - sizeof(cl_map_item_t)); - - if (root == NULL) - return PSM2_NO_MEMORY; - - nil_item = &root - [psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt) + 1]; - - ips_tidcache_map_init(p_map,root,nil_item); - - NTID = 0; - NIDLE = 0; - IPREV(IHEAD) = INEXT(IHEAD) = IHEAD; - for (i = 1; i <= psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt); i++) { - INVALIDATE(i) = 1; - } - - /* - * if not shared context, all tids are used by the same - * process. Otherwise, subcontext process can only cache - * its own portion. Driver makes the same tid number - * assignment to subcontext processes. 
- */ - tidc->tid_cachesize = psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt); - if (psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt) > 0) { - uint16_t remainder = tidc->tid_cachesize % - psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt); - tidc->tid_cachesize /= psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt); - if (psmi_hal_get_subctxt(context->psm_hw_ctxt) < remainder) - tidc->tid_cachesize++; - } - } - - /* - * Setup shared control structure. - */ - tidc->tid_ctrl = (struct ips_tid_ctrl *)context->tid_ctrl; - if (!tidc->tid_ctrl) { - tidc->tid_ctrl = (struct ips_tid_ctrl *) - psmi_calloc(context->ep, UNDEFINED, 1, - sizeof(struct ips_tid_ctrl)); - if (tidc->tid_ctrl == NULL) { - return PSM2_NO_MEMORY; - } - } - - /* - * Only the master process can initialize. - */ - if (psmi_hal_get_subctxt(context->psm_hw_ctxt) == 0) { - pthread_spin_init(&tidc->tid_ctrl->tid_ctrl_lock, - PTHREAD_PROCESS_SHARED); - - tidc->tid_ctrl->tid_num_max = - psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt); - tidc->tid_ctrl->tid_num_avail = tidc->tid_ctrl->tid_num_max; - } - - return psm3_stats_register_type("TID_Statistics", - PSMI_STATSTYPE_RDMA, - entries, - PSMI_HOWMANY(entries), - psm3_epid_fmt_internal(protoexp->proto->ep->epid, 0), tidc, - protoexp->proto->ep->dev_name); -} - -psm2_error_t ips_tid_fini(struct ips_tid *tidc) -{ - psm3_stats_deregister_type(PSMI_STATSTYPE_RDMA, tidc); - - if (tidc->tid_array) - ips_tidcache_cleanup(tidc); - - if (!tidc->context->tid_ctrl) - psmi_free(tidc->tid_ctrl); - - return PSM2_OK; -} - -psm2_error_t -ips_tid_acquire(struct ips_tid *tidc, - const void *buf, uint32_t *length, - uint32_t *tid_array, uint32_t *tidcnt -#ifdef PSM_CUDA - , uint8_t is_cuda_ptr -#endif - ) -{ - struct ips_tid_ctrl *ctrl = tidc->tid_ctrl; - psm2_error_t err = PSM2_OK; - uint16_t flags = 0; - int rc; - - psmi_assert(((uintptr_t) buf & 0xFFF) == 0); - psmi_assert(((*length) & 0xFFF) == 0); - - if (tidc->context->tid_ctrl) - pthread_spin_lock(&ctrl->tid_ctrl_lock); - - if (!ctrl->tid_num_avail) { - err = PSM2_EP_NO_RESOURCES; - goto fail; - } - - /* Clip length if it exceeds worst case tid allocation, - where each entry in the tid array can accommodate only - 1 page. */ - if (*length > 4096*tidc->tid_ctrl->tid_num_max) - { - *length = 4096*tidc->tid_ctrl->tid_num_max; - } - -#ifdef PSM_CUDA - if (is_cuda_ptr) - flags = PSM_HAL_BUF_GPU_MEM; -#endif - - rc = psmi_hal_update_tid(tidc->context->psm_hw_ctxt, - (uint64_t) (uintptr_t) buf, length, - (uint64_t) (uintptr_t) tid_array, tidcnt, flags); - - if (rc < 0) { - /* Unable to pin pages? 
retry later */ - err = PSM2_EP_DEVICE_FAILURE; - goto fail; - } - - psmi_assert_always((*tidcnt) > 0); - psmi_assert(ctrl->tid_num_avail >= (*tidcnt)); - ctrl->tid_num_avail -= (*tidcnt); - tidc->tid_num_total += (*tidcnt); - tidc->tid_num_inuse += (*tidcnt); - -fail: - if (tidc->context->tid_ctrl) - pthread_spin_unlock(&ctrl->tid_ctrl_lock); - - return err; -} - -psm2_error_t -ips_tid_release(struct ips_tid *tidc, - uint32_t *tid_array, uint32_t tidcnt) -{ - struct ips_tid_ctrl *ctrl = tidc->tid_ctrl; - psm2_error_t err = PSM2_OK; - - psmi_assert(tidcnt > 0); - if (tidc->context->tid_ctrl) - pthread_spin_lock(&ctrl->tid_ctrl_lock); - - if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, - (uint64_t) (uintptr_t) tid_array, tidcnt) < 0) { - if (tidc->context->tid_ctrl) - pthread_spin_unlock(&ctrl->tid_ctrl_lock); - - /* If failed to unpin pages, it's fatal error */ - err = psm3_handle_error(tidc->context->ep, - PSM2_EP_DEVICE_FAILURE, - "Failed to tid free %d tids", - tidcnt); - goto fail; - } - - ctrl->tid_num_avail += tidcnt; - if (tidc->context->tid_ctrl) - pthread_spin_unlock(&ctrl->tid_ctrl_lock); - - tidc->tid_num_inuse -= tidcnt; - /* If an available callback is registered invoke it */ - if (((tidc->tid_num_inuse + tidcnt) == ctrl->tid_num_max) - && tidc->tid_avail_cb) - tidc->tid_avail_cb(tidc, tidc->tid_avail_context); - -fail: - return err; -} -#endif // PSM_OPA diff --git a/psm3/ptl_ips/ips_tid.h b/psm3/ptl_ips/ips_tid.h index e98d750..6d31def 100644 --- a/psm3/ptl_ips/ips_tid.h +++ b/psm3/ptl_ips/ips_tid.h @@ -58,99 +58,4 @@ #ifndef _IPS_TID_H #define _IPS_TID_H -#ifdef PSM_OPA -#include "psm_user.h" -#include "ips_tidcache.h" - -struct ips_tid; - -typedef void (*ips_tid_avail_cb_fn_t) (struct ips_tid *, void *context); - -struct ips_tid_ctrl { - pthread_spinlock_t tid_ctrl_lock; - uint32_t tid_num_max; - uint32_t tid_num_avail; -} __attribute__ ((aligned(64))); - -struct ips_tid { - const psmi_context_t *context; - struct ips_protoexp *protoexp; - - void *tid_avail_context; - struct ips_tid_ctrl *tid_ctrl; - - ips_tid_avail_cb_fn_t tid_avail_cb; - uint64_t tid_num_total; - uint32_t tid_num_inuse; - uint32_t tid_cachesize; /* items can be cached */ - cl_qmap_t tid_cachemap; /* RB tree implementation */ - /* - * tids storage. - * This is used in tid registration caching case for - * tid invalidation, acquire, replace and release, - * entries should be the assigned tid number. - */ - uint32_t *tid_array; -}; - -psm2_error_t ips_tid_init(const psmi_context_t *context, - struct ips_protoexp *protoexp, - ips_tid_avail_cb_fn_t cb, void *cb_context); -psm2_error_t ips_tid_fini(struct ips_tid *tidc); - -/* Acquiring tids. 
- * Buffer base has to be aligned on page boundary - * Buffer length has to be multiple pages - */ -psm2_error_t ips_tidcache_acquire(struct ips_tid *tidc, - const void *buf, /* input buffer, aligned to page boundary */ - uint32_t *length, /* buffer length, aligned to page size */ - uint32_t *tid_array, /* output tidarray, */ - uint32_t *tidcnt, /* output of tid count */ - uint32_t *pageoff /* output of offset in first tid */ -#ifdef PSM_CUDA - , uint8_t is_cuda_ptr -#endif - ); - -psm2_error_t ips_tidcache_release(struct ips_tid *tidc, - uint32_t *tid_array, /* input tidarray, */ - uint32_t tidcnt); /* input of tid count */ - -psm2_error_t ips_tidcache_cleanup(struct ips_tid *tidc); -psm2_error_t ips_tidcache_invalidation(struct ips_tid *tidc); - -psm2_error_t ips_tid_acquire(struct ips_tid *tidc, - const void *buf, /* input buffer, aligned to page boundary */ - uint32_t *length, /* buffer length, aligned to page size */ - uint32_t *tid_array, /* output tidarray, */ - uint32_t *tidcnt -#ifdef PSM_CUDA - , uint8_t is_cuda_ptr -#endif - ); /* output of tid count */ - -psm2_error_t ips_tid_release(struct ips_tid *tidc, - uint32_t *tid_array, /* input tidarray, */ - uint32_t tidcnt); /* input of tid count */ - -PSMI_INLINE(int ips_tid_num_available(struct ips_tid *tidc)) -{ - if (tidc->tid_ctrl->tid_num_avail == 0) { - if (tidc->tid_ctrl->tid_num_max == tidc->tid_num_inuse) - return -1; - else - return 0; - } - - return tidc->tid_ctrl->tid_num_avail; -} - -/* Note that the caller is responsible for making sure that NIDLE is non-zero - before calling ips_tidcache_evict. If NIDLE is 0 at the time of call, - ips_tidcache_evict is unstable. - */ -uint64_t ips_tidcache_evict(struct ips_tid *tidc, uint64_t length); - -#endif // PSM_OPA #endif /* _IPS_TID_H */ diff --git a/psm3/ptl_ips/ips_tidcache.c b/psm3/ptl_ips/ips_tidcache.c index 7c04d87..f7588b8 100644 --- a/psm3/ptl_ips/ips_tidcache.c +++ b/psm3/ptl_ips/ips_tidcache.c @@ -51,635 +51,3 @@ */ -#ifdef PSM_OPA -#include "psm_user.h" -#include "psm2_hal.h" -#include "ips_proto.h" -#include "ips_expected_proto.h" - -#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start) -#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length<<12)) -#define RBTREE_ASSERT psmi_assert -#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->ntid) - -#include "psm3_rbtree.c" - -void ips_tidcache_map_init(cl_qmap_t *p_map, - cl_map_item_t* const root, - cl_map_item_t* const nil_item) -{ - ips_cl_qmap_init(p_map,root,nil_item); -} - -/* - * - * Force to remove a tid, check invalidation event afterwards. - */ -static psm2_error_t -ips_tidcache_remove(struct ips_tid *tidc, uint32_t tidcnt) -{ - cl_qmap_t *p_map = &tidc->tid_cachemap; - uint32_t idx; - uint64_t events_mask; - psm2_error_t err; - - /* - * call driver to free the tids. - */ - if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, - (uint64_t) (uintptr_t) tidc->tid_array, tidcnt) < 0) { - /* If failed to unpin pages, it's fatal error */ - err = psm3_handle_error(tidc->context->ep, - PSM2_EP_DEVICE_FAILURE, - "Failed to tid free %d tids", 1); - return err; - } - - while (tidcnt) { - tidcnt--; - idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) + - IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]); - - /* - * sanity check. - */ - psmi_assert(idx != 0); - psmi_assert(idx <= tidc->tid_ctrl->tid_num_max); - psmi_assert(INVALIDATE(idx) == 0); - psmi_assert(REFCNT(idx) == 0); - - /* - * mark the tid invalidated. - */ - INVALIDATE(idx) = 1; - - /* - * remove the tid from RB tree. 
- */ - IDLE_REMOVE(idx); - ips_cl_qmap_remove_item(p_map, &p_map->root[idx]); - } - - /* - * Because the freed tid is not from invalidation list, - * it is possible that kernel just invalidated the tid, - * then we need to check and process the invalidation - * before we can re-use this tid. The reverse order - * will wrongly invalidate this tid again. - */ - err = psmi_hal_get_hfi_event_bits(&events_mask,tidc->context->psm_hw_ctxt); - - if_pf (err) - return PSM2_INTERNAL_ERR; - - if (events_mask & PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY) { - err = ips_tidcache_invalidation(tidc); - if (err) - return err; - } - - return PSM2_OK; -} - -/* - * Register a new buffer with driver, and cache the tidinfo. - */ -static psm2_error_t -ips_tidcache_register(struct ips_tid *tidc, - unsigned long start, uint32_t length, uint32_t *firstidx -#ifdef PSM_CUDA - , uint8_t is_cuda_ptr -#endif - ) -{ - cl_qmap_t *p_map = &tidc->tid_cachemap; - uint32_t tidoff, tidlen; - uint32_t idx, tidcnt; - uint16_t flags = 0; - psm2_error_t err; - - /* - * make sure we have at least one free tid to - * register the new buffer. - */ - if (NTID == tidc->tid_cachesize) { - /* all tids are in active use, error? */ - if (NIDLE == 0) - return PSM2_OK_NO_PROGRESS; - - /* - * free the first tid in idle queue. - */ - idx = IPREV(IHEAD); - tidc->tid_array[0] = p_map->root[idx].payload.tidinfo; - err = ips_tidcache_remove(tidc, 1); - if (err) - return err; - } - psmi_assert(NTID < tidc->tid_cachesize); - - /* Clip length if it exceeds worst case tid allocation, - where each entry in the tid array can accommodate only - 1 page. */ - if (length > 4096*tidc->tid_ctrl->tid_num_max) - { - length = 4096*tidc->tid_ctrl->tid_num_max; - } - /* - * register the new buffer. - */ - -retry: - tidcnt = 0; - -#ifdef PSM_CUDA - if (is_cuda_ptr) - flags = PSM_HAL_BUF_GPU_MEM; -#endif - - if (psmi_hal_update_tid(tidc->context->psm_hw_ctxt, - (uint64_t) start, &length, - (uint64_t) tidc->tid_array, &tidcnt, - flags) < 0) { - /* if driver reaches lockable memory limit */ - if ((errno == ENOMEM -#ifdef PSM_CUDA - /* This additional check is in place for just the cuda - * version. It is a temporary workaround for a known - * issue where nvidia driver returns EINVAL instead of - * ENOMEM when there is no BAR1 space left to pin pages. - * PSM frees tidcache enteries when the driver sends - * EINVAL there by unpinning pages and freeing some - * BAR1 space.*/ - || (PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM((void*)start) && errno == EINVAL) -#endif - ) && NIDLE) { - uint64_t lengthEvicted = ips_tidcache_evict(tidc,length); - - if (lengthEvicted >= length) - goto retry; - } else if (errno == EFAULT) - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - " Unhandled error in TID Update: %s\n", strerror(errno)); -#ifdef PSM_CUDA - else if (PSMI_IS_GPU_ENABLED && errno == ENOTSUP) - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - " Nvidia driver apis mismatch: %s\n", strerror(errno)); -#endif - - /* Unable to pin pages? retry later */ - return PSM2_EP_DEVICE_FAILURE; - } - psmi_assert_always(tidcnt > 0); - psmi_assert((tidcnt+NTID) <= tidc->tid_cachesize); - - /* - * backward processing because we want to return - * the first RB index in the array. - */ - idx = 0; - tidoff = length; - while (tidcnt) { - /* - * Driver only returns tidctrl=1 or tidctrl=2. - */ - tidcnt--; - idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) + - IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]); - tidlen = IPS_TIDINFO_GET_LENGTH(tidc->tid_array[tidcnt]); - - /* - * sanity check. 
- */ - psmi_assert(idx != 0); - psmi_assert(idx <= tidc->tid_ctrl->tid_num_max); - psmi_assert(INVALIDATE(idx) != 0); - psmi_assert(REFCNT(idx) == 0); - - /* - * clear the tid invalidated. - */ - INVALIDATE(idx) = 0; - - /* - * put the tid into a RB node. - */ - tidoff -= tidlen << 12; - START(idx) = start + tidoff; - LENGTH(idx) = tidlen; - p_map->root[idx].payload.tidinfo = tidc->tid_array[tidcnt]; - - /* - * put the node into RB tree and idle queue head. - */ - IDLE_INSERT(idx); - ips_cl_qmap_insert_item(p_map, &p_map->root[idx]); - } - psmi_assert(idx != 0); - psmi_assert(tidoff == 0); - *firstidx = idx; - - return PSM2_OK; -} - -/* - * Get mmu notifier invalidation info and update PSM's caching. - */ -psm2_error_t -ips_tidcache_invalidation(struct ips_tid *tidc) -{ - cl_qmap_t *p_map = &tidc->tid_cachemap; - uint32_t i, j, idx, tidcnt; - psm2_error_t err; - - /* - * get a list of invalidated tids from driver, - * driver will clear the event bit before return. - */ - tidcnt = 0; - if (psmi_hal_get_tidcache_invalidation(tidc->context->psm_hw_ctxt, - (uint64_t) (uintptr_t) tidc->tid_array, - &tidcnt) < 0) { - /* If failed to get invalidation info, it's fatal error */ - err = psm3_handle_error(tidc->context->ep, - PSM2_EP_DEVICE_FAILURE, - "Failed to get invalidation info"); - return err; - } - psmi_assert(tidcnt > 0 && tidcnt <= tidc->tid_ctrl->tid_num_max); - - j = 0; - for (i = 0; i < tidcnt; i++) { - /* - * Driver only returns tidctrl=1 or tidctrl=2. - */ - idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[i]) + - IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[i]); - psmi_assert(idx != 0); - psmi_assert(idx <= tidc->tid_ctrl->tid_num_max); - - /* - * sanity check. - */ -#if 0 - /* disabled this assert since observed it on OPA debug build on - * nVidia gv100 GPU with small BAR space. When disabled OSU tests - * and mpi_stress all worked fine. Suspect the assert is inaccurate - * and since it's for OPA code, not worth further debug. Did attempt - * placing the assert after the INVALIDATE test below and it still - * failed. - */ - psmi_assert(p_map->root[idx].payload.tidinfo == tidc->tid_array[i]); - psmi_assert(LENGTH(idx) == - IPS_TIDINFO_GET_LENGTH(tidc->tid_array[i])); -#endif - - /* - * if the tid is already invalidated, ignore it, - * but do sanity check. - */ - if (INVALIDATE(idx) != 0) { - psmi_assert(REFCNT(idx) == 0); - continue; - } - - /* - * mark the tid invalidated. - */ - INVALIDATE(idx) = 1; - - /* - * if the tid is idle, remove the tid from RB tree - * and idle queue, put on free list. - */ - if (REFCNT(idx) == 0) { - IDLE_REMOVE(idx); - ips_cl_qmap_remove_item(p_map, &p_map->root[idx]); - - if (i != j) - tidc->tid_array[j] = tidc->tid_array[i]; - j++; - } - } - - if (j > 0) { - /* - * call driver to free the tids. 
- */ - if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, - (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) { - /* If failed to unpin pages, it's fatal error */ - err = psm3_handle_error(tidc->context->ep, - PSM2_EP_DEVICE_FAILURE, - "Failed to tid free %d tids", j); - return err; - } - } - - return PSM2_OK; -} - -psm2_error_t -ips_tidcache_acquire(struct ips_tid *tidc, - const void *buf, uint32_t *length, - uint32_t *tid_array, uint32_t *tidcnt, - uint32_t *tidoff -#ifdef PSM_CUDA - , uint8_t is_cuda_ptr -#endif - ) -{ - cl_qmap_t *p_map = &tidc->tid_cachemap; - cl_map_item_t *p_item; - unsigned long start = (unsigned long)buf; - unsigned long end = start + (*length); - uint32_t idx, nbytes; - uint64_t event_mask; - psm2_error_t err; - - /* - * Before every tid caching search, we need to update the - * tid caching if there is invalidation event, otherwise, - * the cached address may be invalidated and we might have - * wrong matching. - */ - err = psmi_hal_get_hfi_event_bits(&event_mask,tidc->context->psm_hw_ctxt); - - if_pf (err) - return PSM2_INTERNAL_ERR; - - if (event_mask & PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY) { - err = ips_tidcache_invalidation(tidc); - if (err) - return err; - } - - /* - * Now we can do matching from the caching, because obsolete - * address in caching has been removed or identified. - */ -retry: - p_item = ips_cl_qmap_search(p_map, start, end); - idx = 2*IPS_TIDINFO_GET_TID(p_item->payload.tidinfo) + - IPS_TIDINFO_GET_TIDCTRL(p_item->payload.tidinfo); - - /* - * There is tid matching. - */ - if (idx) { - /* - * if there is a caching match, but the tid has been - * invalidated, we can't match this tid, and we also - * can't register this address, we need to wait this - * tid to be freed. - */ - if (INVALIDATE(idx) != 0) - return PSM2_OK_NO_PROGRESS; - - /* - * if the page offset within the tid is not less than - * 128K, the address offset within the page is not 64B - * multiple, PSM can't handle this tid with any offset - * mode. We need to free this tid and re-register with - * the asked page address. - */ - if (((start - START(idx)) >= 131072) && ((*tidoff) & 63)) { - /* - * If the tid is currently used, retry later. - */ - if (REFCNT(idx) != 0) - return PSM2_OK_NO_PROGRESS; - - /* - * free this tid. - */ - tidc->tid_array[0] = p_map->root[idx].payload.tidinfo; - err = ips_tidcache_remove(tidc, 1); - if (err) - return err; - - /* try to match a node again */ - goto retry; - } - } - - /* - * If there is no match node, or 'start' falls out of node range, - * whole or partial buffer from 'start' is not registered yet. - */ - if (!idx || START(idx) > start) { - if (!idx) - nbytes = end - start; - else - nbytes = START(idx) - start; - - /* - * Because we don't have any match tid yet, if - * there is an error, we return from here, PSM - * will try later. - */ - err = ips_tidcache_register(tidc, start, nbytes, &idx -#ifdef PSM_CUDA - , is_cuda_ptr -#endif - ); - if (err) - return err; - } - - /* - * sanity check. - */ - psmi_assert(START(idx) <= start); - psmi_assert(INVALIDATE(idx) == 0); - - *tidoff += start - START(idx); - *tidcnt = 1; - - tid_array[0] = p_map->root[idx].payload.tidinfo; - REFCNT(idx)++; - if (REFCNT(idx) == 1) - IDLE_REMOVE(idx); - start = END(idx); - - while (start < end) { - p_item = ips_cl_qmap_successor(p_map, &p_map->root[idx]); - idx = 2*IPS_TIDINFO_GET_TID(p_item->payload.tidinfo) + - IPS_TIDINFO_GET_TIDCTRL(p_item->payload.tidinfo); - if (!idx || START(idx) != start) { - if (!idx) - nbytes = end - start; - else - nbytes = (START(idx) > end) ? 
- (end - start) : - (START(idx) - start); - - /* - * Because we already have at least one match tid, - * if it is error to register new pages, we break - * here and return the tids we already have. - */ - err = ips_tidcache_register(tidc, start, nbytes, &idx -#ifdef PSM_CUDA - , is_cuda_ptr -#endif - ); - if (err) - break; - } else if (INVALIDATE(idx) != 0) { - /* - * the tid has been invalidated, it is still in - * caching because it is still being used, but - * any new usage is not allowed, we ignore it and - * return the tids we already have. - */ - psmi_assert(REFCNT(idx) != 0); - break; - } - - /* - * sanity check. - */ - psmi_assert(START(idx) == start); - psmi_assert(INVALIDATE(idx) == 0); - - tid_array[(*tidcnt)++] = p_map->root[idx].payload.tidinfo; - REFCNT(idx)++; - if (REFCNT(idx) == 1) - IDLE_REMOVE(idx); - start = END(idx); - } - - if (start < end) - *length = start - (unsigned long)buf; - /* otherwise, all pages are registered */ - psmi_assert((*tidcnt) > 0); - - return PSM2_OK; -} - -psm2_error_t -ips_tidcache_release(struct ips_tid *tidc, - uint32_t *tid_array, uint32_t tidcnt) -{ - cl_qmap_t *p_map = &tidc->tid_cachemap; - uint32_t i, j, idx; - psm2_error_t err; - - psmi_assert(tidcnt > 0); - - j = 0; - for (i = 0; i < tidcnt; i++) { - /* - * Driver only returns tidctrl=1 or tidctrl=2. - */ - idx = 2*IPS_TIDINFO_GET_TID(tid_array[i]) + - IPS_TIDINFO_GET_TIDCTRL(tid_array[i]); - psmi_assert(idx != 0); - psmi_assert(idx <= tidc->tid_ctrl->tid_num_max); - psmi_assert(REFCNT(idx) != 0); - - REFCNT(idx)--; - if (REFCNT(idx) == 0) { - if (INVALIDATE(idx) != 0) { - ips_cl_qmap_remove_item(p_map, &p_map->root[idx]); - - tidc->tid_array[j] = tid_array[i]; - j++; - } else { - IDLE_INSERT(idx); - } - } - } - - if (j > 0) { - /* - * call driver to free the tids. - */ - if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, - (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) { - /* If failed to unpin pages, it's fatal error */ - err = psm3_handle_error(tidc->context->ep, - PSM2_EP_DEVICE_FAILURE, - "Failed to tid free %d tids", j); - return err; - } - } - - return PSM2_OK; -} - -/* - * - * Call driver to free all cached tids. - */ -psm2_error_t -ips_tidcache_cleanup(struct ips_tid *tidc) -{ - cl_qmap_t *p_map = &tidc->tid_cachemap; - psm2_error_t err; - int i, j; - - j = 0; - for (i = 1; i <= tidc->tid_ctrl->tid_num_max; i++) { - psmi_assert(REFCNT(i) == 0); - if (INVALIDATE(i) == 0) { - tidc->tid_array[j++] = p_map->root[i].payload.tidinfo; - } - } - - if (j > 0) { - /* - * call driver to free the tids. - */ - if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, - (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) { - /* If failed to unpin pages, it's fatal error */ - err = psm3_handle_error(tidc->context->ep, - PSM2_EP_DEVICE_FAILURE, - "Failed to tid free %d tids", j); - return err; - } - } - - psmi_free(tidc->tid_array); - psmi_free(tidc->tid_cachemap.root); - - return PSM2_OK; -} - - -/* Note that the caller is responsible for making sure that NIDLE is non-zero - before calling ips_tidcache_evict. If NIDLE is 0 at the time of call, - ips_tidcache_evict is unstable. 
- */ -uint64_t -ips_tidcache_evict(struct ips_tid *tidc,uint64_t length) -{ - cl_qmap_t *p_map = &tidc->tid_cachemap; - uint32_t idx = IHEAD, tidcnt = 0, tidlen = 0; - /* - * try to free the required - * pages from idle queue tids - */ - - do { - idx = IPREV(idx); - psmi_assert(idx != 0); - tidc->tid_array[tidcnt] = - p_map->root[idx].payload.tidinfo; - tidcnt++; - - tidlen += IPS_TIDINFO_GET_LENGTH - (p_map->root[idx].payload.tidinfo)<<12; - } while (tidcnt < NIDLE && tidlen < length); - - /* - * free the selected tids on successfully finding some:. - */ - if (tidcnt > 0 && ips_tidcache_remove(tidc, tidcnt)) - return 0; - - return tidlen; -} -#endif // PSM_OPA diff --git a/psm3/ptl_ips/ips_tidflow.c b/psm3/ptl_ips/ips_tidflow.c index 65ed7bc..6187835 100644 --- a/psm3/ptl_ips/ips_tidflow.c +++ b/psm3/ptl_ips/ips_tidflow.c @@ -75,9 +75,6 @@ psm2_error_t psm3_ips_tf_init(struct ips_protoexp *protoexp, { int tf_idx; psm2_ep_t ep = protoexp->proto->ep; -#ifdef PSM_OPA - psmi_context_t *context = &ep->context; -#endif #if TF_ADD struct psmi_stats_entry entries[] = { @@ -87,22 +84,11 @@ psm2_error_t psm3_ips_tf_init(struct ips_protoexp *protoexp, }; #endif -#ifdef PSM_OPA - tfc->context = context; -#endif tfc->tf_num_total = 0; tfc->tf_num_inuse = 0; tfc->tf_avail_cb = cb; tfc->tf_avail_context = (void *)protoexp; -#ifndef PSM_OPA tfc->tf_gen_mask = 0xFFFFF; -#else - if (psmi_hal_has_cap(PSM_HAL_CAP_EXTENDED_PSN)) { - tfc->tf_gen_mask = 0xFFFFF; - } else { - tfc->tf_gen_mask = 0x1FFF; - } -#endif /* Allocate and Initialize tidrecvc array. */ tfc->tidrecvc = (struct ips_tid_recv_desc *) @@ -112,26 +98,12 @@ psm2_error_t psm3_ips_tf_init(struct ips_protoexp *protoexp, return PSM2_NO_MEMORY; for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { -#ifdef PSM_OPA - tfc->tidrecvc[tf_idx].context = context; -#endif tfc->tidrecvc[tf_idx].protoexp = protoexp; tfc->tidrecvc[tf_idx].rdescid._desc_idx = tf_idx; tfc->tidrecvc[tf_idx].rdescid._desc_genc = tf_idx; -#ifdef PSM_OPA - tfc->tidrecvc[tf_idx].tidflow.flowid = EP_FLOW_TIDFLOW; - tfc->tidrecvc[tf_idx].tidflow.frag_size = protoexp->proto->epinfo.ep_mtu; -#endif } -#ifdef PSM_OPA - /* Shared control structure, it will be in shared memory - * for context sharing, otherwise calloc() it */ - tfc->tf_ctrl = (struct ips_tf_ctrl *)context->tf_ctrl; - if (!tfc->tf_ctrl) { -#else { -#endif tfc->tf_ctrl = (struct ips_tf_ctrl *) psmi_calloc(ep, UNDEFINED, 1, sizeof(struct ips_tf_ctrl)); @@ -143,13 +115,7 @@ psm2_error_t psm3_ips_tf_init(struct ips_protoexp *protoexp, /* * Only the master process can initialize. 
*/ -#ifdef PSM_OPA - if (psmi_hal_get_subctxt(context->psm_hw_ctxt) == 0) { - pthread_spin_init(&tfc->tf_ctrl->tf_ctrl_lock, - PTHREAD_PROCESS_SHARED); -#else { -#endif tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS; tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS; @@ -170,10 +136,6 @@ psm2_error_t psm3_ips_tf_init(struct ips_protoexp *protoexp, tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; #endif -#ifdef PSM_OPA - psmi_hal_tidflow_reset(tfc->context->psm_hw_ctxt, tf_idx, - tfc->tf_gen_mask, 0x7FF); -#endif } #if 1 for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { @@ -199,9 +161,6 @@ psm2_error_t psm3_ips_tf_init(struct ips_protoexp *protoexp, psm2_error_t psm3_ips_tf_fini(struct ips_tf *tfc) { psm3_stats_deregister_type(PSMI_STATSTYPE_RDMA, tfc); -#ifdef PSM_OPA - if (!tfc->context->tf_ctrl) -#endif psmi_free(tfc->tf_ctrl); psmi_free(tfc->tidrecvc); return PSM2_OK; @@ -214,20 +173,11 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, struct ips_tf_ctrl *ctrl = tfc->tf_ctrl; struct ips_tf_entry *entry; -#ifdef PSM_OPA - // shared context needs lock - if (tfc->context->tf_ctrl) - pthread_spin_lock(&ctrl->tf_ctrl_lock); -#endif if (!ctrl->tf_num_avail) { psmi_assert(ctrl->tf_head == HFI_TF_NFLOWS); *tidrecvc = NULL; -#ifdef PSM_OPA - if (tfc->context->tf_ctrl) - pthread_spin_unlock(&ctrl->tf_ctrl_lock); -#endif return PSM2_EP_NO_RESOURCES; } @@ -236,10 +186,6 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, ctrl->tf_head = entry->next_free; ctrl->tf_num_avail--; -#ifdef PSM_OPA - if (tfc->context->tf_ctrl) - pthread_spin_unlock(&ctrl->tf_ctrl_lock); -#endif tfc->tf_num_total++; tfc->tf_num_inuse++; @@ -254,11 +200,6 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, psmi_assert((*tidrecvc)->rdescid._desc_idx == entry->tf_idx); psmi_assert_always(entry->next_gen < tfc->tf_gen_mask); -#ifdef PSM_OPA - entry->next_gen++; - if (entry->next_gen == tfc->tf_gen_mask) - entry->next_gen = 0; -#endif return PSM2_OK; } @@ -275,17 +216,6 @@ psm2_error_t psm3_ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx, int use psmi_assert(entry->state == TF_STATE_ALLOCATED); entry->state = TF_STATE_DEALLOCATED; -#ifdef PSM_OPA - /* - * The wire protocol only uses 16bits tidrecvc generation - * count in exptid packet, this should be bigger enough, - * u16w3 is the lower 16bits of _desc_genc - */ - tfc->tidrecvc[tf_idx].rdescid.u16w3++; - /* Mark invalid generation for flow (stale packets will be dropped) */ - psmi_hal_tidflow_reset(tfc->context->psm_hw_ctxt, tf_idx, - tfc->tf_gen_mask, 0x7FF); -#else if (used) { entry->next_gen++; if (entry->next_gen == tfc->tf_gen_mask) @@ -296,22 +226,12 @@ psm2_error_t psm3_ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx, int use */ tfc->tidrecvc[tf_idx].rdescid.u32w1++; } -#endif -#ifdef PSM_OPA - // shared context needs lock - if (tfc->context->tf_ctrl) - pthread_spin_lock(&ctrl->tf_ctrl_lock); -#endif entry->next_free = ctrl->tf_head; ctrl->tf_head = tf_idx; ctrl->tf_num_avail++; -#ifdef PSM_OPA - if (tfc->context->tf_ctrl) - pthread_spin_unlock(&ctrl->tf_ctrl_lock); -#endif tfc->tf_num_inuse--; /* If an available callback is registered invoke it */ @@ -321,28 +241,3 @@ psm2_error_t psm3_ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx, int use return PSM2_OK; } -#ifdef PSM_OPA -/* Allocate a generation for a flow */ -psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc, - uint32_t tf_idx, uint32_t *tfgen) -{ - struct ips_tf_entry *entry; - int ret = PSM2_OK; - - psmi_assert(tf_idx < HFI_TF_NFLOWS); - psmi_assert(tf_idx >= 0); - - entry = 
&tfc->tf_ctrl->tf[tf_idx]; - psmi_assert(entry->state == TF_STATE_ALLOCATED); - - *tfgen = entry->next_gen; - - entry->next_gen++; - if (entry->next_gen == tfc->tf_gen_mask) - entry->next_gen = 0; - - psmi_assert_always(*tfgen < tfc->tf_gen_mask); - - return ret; -} -#endif diff --git a/psm3/ptl_ips/ips_tidflow.h b/psm3/ptl_ips/ips_tidflow.h index bfa6efb..f3c2935 100644 --- a/psm3/ptl_ips/ips_tidflow.h +++ b/psm3/ptl_ips/ips_tidflow.h @@ -78,9 +78,6 @@ struct ips_tf_entry { }; struct ips_tf_ctrl { -#ifdef PSM_OPA - pthread_spinlock_t tf_ctrl_lock; // for shared context */ -#endif uint32_t tf_num_max; uint32_t tf_num_avail; uint32_t tf_head; @@ -88,9 +85,6 @@ struct ips_tf_ctrl { } __attribute__ ((aligned(64))); struct ips_tf { -#ifdef PSM_OPA - const psmi_context_t *context; -#endif ips_tf_avail_cb_fn_t tf_avail_cb; void *tf_avail_context; struct ips_tf_ctrl *tf_ctrl; @@ -127,10 +121,5 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, /* Deallocate a tidflow */ psm2_error_t psm3_ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx, int used); -#ifdef PSM_OPA -/* Allocate a generation for a flow */ -psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc, - uint32_t tf_idx, uint32_t *tfgen); -#endif #endif diff --git a/psm3/ptl_ips/ptl.c b/psm3/ptl_ips/ptl.c index 8491bd4..3602ed2 100644 --- a/psm3/ptl_ips/ptl.c +++ b/psm3/ptl_ips/ptl.c @@ -340,10 +340,6 @@ ips_ptl_optctl(const void *core_obj, int optname, /* Set new SL for all flows */ ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path-> pr_sl = new_sl; -#ifdef PSM_OPA - ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].path-> - pr_sl = new_sl; -#endif } } break; diff --git a/psm3/utils/utils_dwordcpy-x86_64.c b/psm3/utils/utils_dwordcpy-x86_64.c index 9676249..6929bc2 100644 --- a/psm3/utils/utils_dwordcpy-x86_64.c +++ b/psm3/utils/utils_dwordcpy-x86_64.c @@ -165,133 +165,6 @@ void psm3_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqword } } -#ifdef PSM_OPA -#ifdef PSM_AVX512 -void psm3_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) -{ - volatile __m512i *dp = (volatile __m512i *) dest; - const __m512i *sp = (const __m512i *) src; - - psmi_assert((dp != NULL) && (sp != NULL)); - psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); - - if ((((uintptr_t) sp) & 0x3f) == 0x0) { - /* source and destination are both 64 byte aligned */ - do { - __m512i tmp0 = _mm512_load_si512(sp); - _mm512_store_si512((__m512i *)dp, tmp0); - } while ((--nblock) && (++dp) && (++sp)); - } else { - /* only destination is 64 byte aligned - use unaligned loads */ - do { - __m512i tmp0 = _mm512_loadu_si512(sp); - _mm512_store_si512((__m512i *)dp, tmp0); - } while ((--nblock) && (++dp) && (++sp)); - } -} -#endif - -void psm3_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) -{ - volatile __m256i *dp = (volatile __m256i *) dest; - const __m256i *sp = (const __m256i *) src; - - psmi_assert((dp != NULL) && (sp != NULL)); - psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); - - if ((((uintptr_t) sp) & 0x1f) == 0x0) { - /* source and destination are both 32 byte aligned */ - do { - __m256i tmp0 = _mm256_load_si256(sp); - __m256i tmp1 = _mm256_load_si256(sp + 1); - _mm256_store_si256((__m256i *)dp, tmp0); - _mm256_store_si256((__m256i *)(dp + 1), tmp1); - } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); - } else { - /* only destination is 32 byte aligned - use unaligned loads */ - do { - __m256i tmp0 = _mm256_loadu_si256(sp); - __m256i tmp1 = _mm256_loadu_si256(sp + 1); - _mm256_store_si256((__m256i *)dp, tmp0); - 
_mm256_store_si256((__m256i *)(dp + 1), tmp1); - } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); - } -} - -void psm3_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) -{ - volatile __m128i *dp = (volatile __m128i *) dest; - const __m128i *sp = (const __m128i *) src; - - psmi_assert((dp != NULL) && (sp != NULL)); - psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); - - if ((((uintptr_t) sp) & 0xf) == 0x0) { - /* source and destination are both 16 byte aligned */ - do { - __m128i tmp0 = _mm_load_si128(sp); - __m128i tmp1 = _mm_load_si128(sp + 1); - __m128i tmp2 = _mm_load_si128(sp + 2); - __m128i tmp3 = _mm_load_si128(sp + 3); - _mm_store_si128((__m128i *)dp, tmp0); - _mm_store_si128((__m128i *)(dp + 1), tmp1); - _mm_store_si128((__m128i *)(dp + 2), tmp2); - _mm_store_si128((__m128i *)(dp + 3), tmp3); - } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); - } else { - /* only destination is 16 byte aligned - use unaligned loads */ - do { - __m128i tmp0 = _mm_loadu_si128(sp); - __m128i tmp1 = _mm_loadu_si128(sp + 1); - __m128i tmp2 = _mm_loadu_si128(sp + 2); - __m128i tmp3 = _mm_loadu_si128(sp + 3); - _mm_store_si128((__m128i *)dp, tmp0); - _mm_store_si128((__m128i *)(dp + 1), tmp1); - _mm_store_si128((__m128i *)(dp + 2), tmp2); - _mm_store_si128((__m128i *)(dp + 3), tmp3); - } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); - } -} - -void psm3_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) -{ - const uint64_t *src64[4]; - volatile uint64_t *dst64[4]; - src64[0] = src; - dst64[0] = dest; - - psmi_assert((dst64[0] != NULL) && (src64[0] != NULL)); - psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0); - - do { - *dst64[0] = *src64[0]; - src64[1] = src64[0] + 1; - src64[2] = src64[0] + 2; - src64[3] = src64[0] + 3; - dst64[1] = dst64[0] + 1; - dst64[2] = dst64[0] + 2; - dst64[3] = dst64[0] + 3; - *dst64[1] = *src64[1]; - *dst64[2] = *src64[2]; - *dst64[3] = *src64[3]; - src64[0] += 4; - dst64[0] += 4; - - *dst64[0] = *src64[0]; - src64[1] = src64[0] + 1; - src64[2] = src64[0] + 2; - src64[3] = src64[0] + 3; - dst64[1] = dst64[0] + 1; - dst64[2] = dst64[0] + 2; - dst64[3] = dst64[0] + 3; - *dst64[1] = *src64[1]; - *dst64[2] = *src64[2]; - *dst64[3] = *src64[3]; - src64[0] += 4; - dst64[0] += 4; - } while (--nblock); -} -#endif /* PSM_OPA */ void MOCKABLE(psm3_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars) { diff --git a/psm3/utils/utils_sysfs.c b/psm3/utils/utils_sysfs.c index d25701a..5b01e9a 100644 --- a/psm3/utils/utils_sysfs.c +++ b/psm3/utils/utils_sysfs.c @@ -85,9 +85,6 @@ static char psm3_sysfs_paths[PSMI_MAX_RAILS][PATH_MAX]; static psm3_port_path_type psm3_sysfs_port_path_fmt; static int psm3_sysfs_path_count = -1; static long psm3_sysfs_page_size; -#ifdef PSM_OPA -static char *hfifs_path; -#endif static int filter_dir(const struct dirent *item) { if (item->d_name[0] == '.') return 0; @@ -143,12 +140,6 @@ int psm3_sysfs_init(const char *nic_class_path, const psm3_port_path_type port_p // for psm3_sysfs_port_open construction of path to port attr psm3_sysfs_port_path_fmt = port_path_fmt; -#ifdef PSM_OPA - if (hfifs_path == NULL) - hfifs_path = getenv("PSM3_HFIFS_PATH"); - if (hfifs_path == NULL) - hfifs_path = "/hfifs"; -#endif if (!psm3_sysfs_page_size) psm3_sysfs_page_size = sysconf(_SC_PAGESIZE); @@ -222,32 +213,6 @@ int psm3_sysfs_find_unit(const char *name) return -1; } -#ifdef PSM_OPA -const char *psm3_hfifs_path(void) -{ - return hfifs_path; -} - -static int psm3_hfifs_open(const char *attr, int flags) -{ - char 
buf[1024]; - int saved_errno; - int fd; - - snprintf(buf, sizeof(buf), "%s/%s", psm3_hfifs_path(), attr); - fd = open(buf, flags); - saved_errno = errno; - - if (fd == -1) { - _HFI_DBG("Failed to open driver attribute '%s': %s\n", attr, - strerror(errno)); - _HFI_DBG("Offending file name: %s\n", buf); - } - - errno = saved_errno; - return fd; -} -#endif // PSM_OPA static int psm3_sysfs_unit_open(uint32_t unit, const char *attr, int flags) { @@ -346,27 +311,6 @@ static int psm3_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr, return fd; } -#ifdef PSM_OPA -static int psm3_hfifs_unit_open(uint32_t unit, const char *attr, int flags) -{ - int saved_errno; - char buf[1024]; - int fd; - - snprintf(buf, sizeof(buf), "%s/%u/%s", psm3_hfifs_path(), unit, attr); - fd = open(buf, flags); - saved_errno = errno; - - if (fd == -1) { - _HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, - unit, strerror(errno)); - _HFI_DBG("Offending file name: %s\n", buf); - } - - errno = saved_errno; - return fd; -} -#endif // PSM_OPA static int read_page(int fd, char **datap) { @@ -497,124 +441,6 @@ int psm3_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr, return ret; } -#ifdef PSM_OPA - -/* free data allocated by read_page or any of the other hfifs functions in this - * file which use it - */ -void psm3_hfifs_free(char *data) -{ - psm3_sysfs_free(data); -} - -/* - * On return, caller must free *datap via psm3_hfifs_free - */ -int psm3_hfifs_read(const char *attr, char **datap) -{ - int fd = -1, ret = -1; - int saved_errno; - - fd = psm3_hfifs_open(attr, O_RDONLY); - saved_errno = errno; - - if (fd == -1) - goto bail; - - ret = read_page(fd, datap); - saved_errno = errno; - -bail: - if (ret == -1) - *datap = NULL; - - if (fd != -1) { - close(fd); - } - - errno = saved_errno; - return ret; -} - -/* - * On return, caller must free *datap via psm3_hfifs_free - */ -int psm3_hfifs_unit_read(uint32_t unit, const char *attr, char **datap) -{ - int fd = -1, ret = -1; - int saved_errno; - - fd = psm3_hfifs_unit_open(unit, attr, O_RDONLY); - saved_errno = errno; - - if (fd == -1) - goto bail; - - ret = read_page(fd, datap); - saved_errno = errno; - -bail: - if (ret == -1) - *datap = NULL; - - if (fd != -1) { - close(fd); - } - - errno = saved_errno; - return ret; -} - -/* - * The _rd routines jread directly into a supplied buffer, - * unlike the _read routines. - */ -int psm3_hfifs_rd(const char *attr, void *buf, int n) -{ - int fd = -1, ret = -1; - int saved_errno; - - fd = psm3_hfifs_open(attr, O_RDONLY); - saved_errno = errno; - - if (fd == -1) - goto bail; - - ret = read(fd, buf, n); - saved_errno = errno; - -bail: - if (fd != -1) { - close(fd); - } - - errno = saved_errno; - return ret; -} - -int psm3_hfifs_unit_rd(uint32_t unit, const char *attr, void *buf, int n) -{ - int fd = -1, ret = -1; - int saved_errno; - - fd = psm3_hfifs_unit_open(unit, attr, O_RDONLY); - saved_errno = errno; - - if (fd == -1) - goto bail; - - ret = read(fd, buf, n); - saved_errno = errno; - -bail: - if (fd != -1) { - close(fd); - } - - errno = saved_errno; - return ret; -} -#endif // PSM_OPA int psm3_sysfs_unit_read_s64(uint32_t unit, const char *attr, int64_t *valp, int base) diff --git a/shared/abi_1_0.c b/shared/abi_1_0.c deleted file mode 100644 index 34d8e60..0000000 --- a/shared/abi_1_0.c +++ /dev/null @@ -1,453 +0,0 @@ -/* - * Copyright (c) 2016-2018 Intel Corporation. All rights reserved. - * Copyright (c) 2017, Cisco Systems, Inc. All rights reserved. 
- * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "config.h" - -#include -#include -#include - -#include -#include -#include - - -/* - * The conversion from abi 1.0 requires being able to cast from a newer - * structure back to the older version. - */ -struct fi_fabric_attr_1_0 { - struct fid_fabric *fabric; - char *name; - char *prov_name; - uint32_t prov_version; -}; - -struct fi_domain_attr_1_0 { - struct fid_domain *domain; - char *name; - enum fi_threading threading; - enum fi_progress control_progress; - enum fi_progress data_progress; - enum fi_resource_mgmt resource_mgmt; - enum fi_av_type av_type; - enum fi_mr_mode mr_mode; - size_t mr_key_size; - size_t cq_data_size; - size_t cq_cnt; - size_t ep_cnt; - size_t tx_ctx_cnt; - size_t rx_ctx_cnt; - size_t max_ep_tx_ctx; - size_t max_ep_rx_ctx; - size_t max_ep_stx_ctx; - size_t max_ep_srx_ctx; -}; - -struct fi_ep_attr_1_0 { - enum fi_ep_type type; - uint32_t protocol; - uint32_t protocol_version; - size_t max_msg_size; - size_t msg_prefix_size; - size_t max_order_raw_size; - size_t max_order_war_size; - size_t max_order_waw_size; - uint64_t mem_tag_format; - size_t tx_ctx_cnt; - size_t rx_ctx_cnt; -}; - -struct fi_tx_attr_1_0 { - uint64_t caps; - uint64_t mode; - uint64_t op_flags; - uint64_t msg_order; - uint64_t comp_order; - size_t inject_size; - size_t size; - size_t iov_limit; - size_t rma_iov_limit; -}; - -/* External structure is still ABI 1.0 compliant */ -#define fi_rx_attr_1_0 fi_rx_attr - -struct fi_info_1_0 { - struct fi_info *next; - uint64_t caps; - uint64_t mode; - uint32_t addr_format; - size_t src_addrlen; - size_t dest_addrlen; - void *src_addr; - void *dest_addr; - fid_t handle; - struct fi_tx_attr_1_0 *tx_attr; - struct fi_rx_attr_1_0 *rx_attr; - struct fi_ep_attr_1_0 *ep_attr; - struct fi_domain_attr_1_0 *domain_attr; - struct fi_fabric_attr_1_0 *fabric_attr; -}; - -struct fi_domain_attr_1_1 { - struct fid_domain *domain; - char *name; - enum fi_threading threading; - enum fi_progress control_progress; - enum fi_progress data_progress; - enum fi_resource_mgmt resource_mgmt; - enum fi_av_type av_type; - int mr_mode; - size_t mr_key_size; - size_t cq_data_size; - size_t cq_cnt; - size_t ep_cnt; - size_t tx_ctx_cnt; - size_t rx_ctx_cnt; - size_t 
max_ep_tx_ctx; - size_t max_ep_rx_ctx; - size_t max_ep_stx_ctx; - size_t max_ep_srx_ctx; - size_t cntr_cnt; - size_t mr_iov_limit; - uint64_t caps; - uint64_t mode; - uint8_t *auth_key; - size_t auth_key_size; - size_t max_err_data; - size_t mr_cnt; -}; - -#define fi_tx_attr_1_1 fi_tx_attr_1_0 -#define fi_rx_attr_1_1 fi_rx_attr_1_0 -#define fi_ep_attr_1_1 fi_ep_attr -#define fi_fabric_attr_1_1 fi_fabric_attr - -struct fi_info_1_1 { - struct fi_info *next; - uint64_t caps; - uint64_t mode; - uint32_t addr_format; - size_t src_addrlen; - size_t dest_addrlen; - void *src_addr; - void *dest_addr; - fid_t handle; - struct fi_tx_attr_1_1 *tx_attr; - struct fi_rx_attr_1_1 *rx_attr; - struct fi_ep_attr_1_1 *ep_attr; - struct fi_domain_attr_1_1 *domain_attr; - struct fi_fabric_attr_1_1 *fabric_attr; -}; - -#define fi_tx_attr_1_2 fi_tx_attr_1_1 -#define fi_rx_attr_1_2 fi_rx_attr_1_1 -#define fi_ep_attr_1_2 fi_ep_attr_1_1 -#define fi_domain_attr_1_2 fi_domain_attr_1_1 -#define fi_fabric_attr_1_2 fi_fabric_attr_1_1 -#define fid_nic_1_2 fid_nic - -struct fi_info_1_2 { - struct fi_info *next; - uint64_t caps; - uint64_t mode; - uint32_t addr_format; - size_t src_addrlen; - size_t dest_addrlen; - void *src_addr; - void *dest_addr; - fid_t handle; - struct fi_tx_attr_1_2 *tx_attr; - struct fi_rx_attr_1_2 *rx_attr; - struct fi_ep_attr_1_2 *ep_attr; - struct fi_domain_attr_1_2 *domain_attr; - struct fi_fabric_attr_1_2 *fabric_attr; - struct fid_nic_1_2 *nic; -}; - -/* -#define fi_tx_attr_1_3 fi_tx_attr -#define fi_rx_attr_1_3 fi_rx_attr_1_2 -#define fi_ep_attr_1_3 fi_ep_attr_1_2 -#define fi_domain_attr_1_3 fi_domain_attr -#define fi_fabric_attr_1_3 fi_fabric_attr_1_2 -fi_info_1_3 -> fi_info -*/ - -#define ofi_dup_attr(dst, src) \ - do { \ - dst = calloc(1, sizeof(*dst)); \ - if (dst) \ - memcpy(dst, src, sizeof(*src)); \ - } while (0); - - -/* - * ABI 1.0 - */ -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -void fi_freeinfo_1_0(struct fi_info_1_0 *info) -{ - fi_freeinfo((struct fi_info *) info); -} -COMPAT_SYMVER(fi_freeinfo_1_0, fi_freeinfo, FABRIC_1.0); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -struct fi_info_1_0 *fi_dupinfo_1_0(const struct fi_info_1_0 *info) -{ - struct fi_info *dup; - - if (!info) - return (struct fi_info_1_0 *) ofi_allocinfo_internal(); - - ofi_dup_attr(dup, info); - if (dup == NULL) { - return NULL; - } - dup->src_addr = NULL; - dup->dest_addr = NULL; - dup->tx_attr = NULL; - dup->rx_attr = NULL; - dup->ep_attr = NULL; - dup->domain_attr = NULL; - dup->fabric_attr = NULL; - dup->next = NULL; - - if (info->src_addr != NULL) { - dup->src_addr = mem_dup(info->src_addr, info->src_addrlen); - if (dup->src_addr == NULL) - goto fail; - } - if (info->dest_addr != NULL) { - dup->dest_addr = mem_dup(info->dest_addr, info->dest_addrlen); - if (dup->dest_addr == NULL) - goto fail; - } - if (info->tx_attr != NULL) { - ofi_dup_attr(dup->tx_attr, info->tx_attr); - if (dup->tx_attr == NULL) - goto fail; - } - if (info->rx_attr != NULL) { - ofi_dup_attr(dup->rx_attr, info->rx_attr); - if (dup->rx_attr == NULL) - goto fail; - } - if (info->ep_attr != NULL) { - ofi_dup_attr(dup->ep_attr, info->ep_attr); - if (dup->ep_attr == NULL) - goto fail; - } - if (info->domain_attr) { - ofi_dup_attr(dup->domain_attr, info->domain_attr); - if (dup->domain_attr == NULL) - goto fail; - if (info->domain_attr->name != NULL) { - dup->domain_attr->name = strdup(info->domain_attr->name); - if (dup->domain_attr->name == NULL) - goto fail; - } - } - if (info->fabric_attr) { - 
ofi_dup_attr(dup->fabric_attr, info->fabric_attr); - if (dup->fabric_attr == NULL) - goto fail; - dup->fabric_attr->name = NULL; - dup->fabric_attr->prov_name = NULL; - if (info->fabric_attr->name != NULL) { - dup->fabric_attr->name = strdup(info->fabric_attr->name); - if (dup->fabric_attr->name == NULL) - goto fail; - } - if (info->fabric_attr->prov_name != NULL) { - dup->fabric_attr->prov_name = strdup(info->fabric_attr->prov_name); - if (dup->fabric_attr->prov_name == NULL) - goto fail; - } - } - return (struct fi_info_1_0 *) dup; - -fail: - fi_freeinfo(dup); - return NULL; -} -COMPAT_SYMVER(fi_dupinfo_1_0, fi_dupinfo, FABRIC_1.0); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int fi_getinfo_1_0(uint32_t version, const char *node, const char *service, - uint64_t flags, const struct fi_info_1_0 *hints_1_0, - struct fi_info_1_0 **info) -{ - struct fi_info *hints; - int ret; - - if (hints_1_0) { - hints = (struct fi_info *)fi_dupinfo_1_0(hints_1_0); - if (!hints) - return -FI_ENOMEM; - } else { - hints = NULL; - } - ret = fi_getinfo(version, node, service, flags, hints, - (struct fi_info **)info); - fi_freeinfo(hints); - - return ret; -} -COMPAT_SYMVER(fi_getinfo_1_0, fi_getinfo, FABRIC_1.0); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int fi_fabric_1_0(struct fi_fabric_attr_1_0 *attr_1_0, - struct fid_fabric **fabric, void *context) -{ - struct fi_fabric_attr attr; - - if (!attr_1_0) - return -FI_EINVAL; - - memcpy(&attr, attr_1_0, sizeof(*attr_1_0)); - - /* Since the API version is not available in ABI 1.0, set the field to - * FI_VERSION(1, 0) for compatibility. The actual API version could be - * anywhere from FI_VERSION(1, 0) to FI_VERSION(1, 4). - */ - attr.api_version = FI_VERSION(1, 0); - return fi_fabric(&attr, fabric, context); -} -COMPAT_SYMVER(fi_fabric_1_0, fi_fabric, FABRIC_1.0); - - -/* - * ABI 1.1 - */ -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -void fi_freeinfo_1_1(struct fi_info_1_1 *info) -{ - fi_freeinfo((struct fi_info *) info); -} -COMPAT_SYMVER(fi_freeinfo_1_1, fi_freeinfo, FABRIC_1.1); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -struct fi_info_1_1 *fi_dupinfo_1_1(const struct fi_info_1_1 *info) -{ - struct fi_info *dup, *base; - - if (!info) - return (struct fi_info_1_1 *) ofi_allocinfo_internal(); - - ofi_dup_attr(base, info); - if (base == NULL) - return NULL; - - dup = fi_dupinfo(base); - - free(base); - return (struct fi_info_1_1 *) dup; -} -COMPAT_SYMVER(fi_dupinfo_1_1, fi_dupinfo, FABRIC_1.1); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int fi_getinfo_1_1(uint32_t version, const char *node, const char *service, - uint64_t flags, const struct fi_info_1_1 *hints_1_1, - struct fi_info_1_1 **info) -{ - struct fi_info *hints; - int ret; - - if (hints_1_1) { - hints = (struct fi_info *) fi_dupinfo_1_1(hints_1_1); - if (!hints) - return -FI_ENOMEM; - } else { - hints = NULL; - } - ret = fi_getinfo(version, node, service, flags, hints, - (struct fi_info **) info); - fi_freeinfo(hints); - - return ret; -} -COMPAT_SYMVER(fi_getinfo_1_1, fi_getinfo, FABRIC_1.1); - -/* - * ABI 1.2 - */ -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -void fi_freeinfo_1_2(struct fi_info_1_2 *info) -{ - fi_freeinfo((struct fi_info *) info); -} -COMPAT_SYMVER(fi_freeinfo_1_2, fi_freeinfo, FABRIC_1.2); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -struct fi_info_1_2 *fi_dupinfo_1_2(const struct fi_info_1_2 *info) -{ - struct fi_info *dup, *base; - - if (!info) - return 
(struct fi_info_1_2 *) ofi_allocinfo_internal(); - - ofi_dup_attr(base, info); - if (base == NULL) - return NULL; - - dup = fi_dupinfo(base); - - free(base); - return (struct fi_info_1_2 *) dup; -} -COMPAT_SYMVER(fi_dupinfo_1_2, fi_dupinfo, FABRIC_1.2); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int fi_getinfo_1_2(uint32_t version, const char *node, const char *service, - uint64_t flags, const struct fi_info_1_2 *hints_1_2, - struct fi_info_1_2 **info) -{ - struct fi_info *hints; - int ret; - - if (hints_1_2) { - hints = (struct fi_info *) fi_dupinfo_1_2(hints_1_2); - if (!hints) - return -FI_ENOMEM; - } else { - hints = NULL; - } - ret = fi_getinfo(version, node, service, flags, hints, - (struct fi_info **) info); - fi_freeinfo(hints); - - return ret; -} -COMPAT_SYMVER(fi_getinfo_1_2, fi_getinfo, FABRIC_1.2); diff --git a/shared/fabric.c b/shared/fabric.c deleted file mode 100644 index 8347dc3..0000000 --- a/shared/fabric.c +++ /dev/null @@ -1,1406 +0,0 @@ -/* - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2017 Intel Corp., Inc. All rights reserved. - * (C) Copyright 2020 Hewlett Packard Enterprise Development LP - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
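The fi_getinfo_1_x/fi_dupinfo_1_x shims removed just above only make sense together with versioned symbols: each shim is bound to an old FABRIC_1.x version node while the current entry point owns the default version. On toolchains with symbol-versioning support the COMPAT_SYMVER/DEFAULT_SYMVER macros appear to reduce to .symver directives; the sketch below shows that general mechanism with made-up function and version names, and it assumes a shared-library build whose linker version script defines the two nodes.

    /* Generic GNU symbol-versioning sketch, not the libfabric macros themselves.
     * Intended for a shared library built with a version script defining
     * EXAMPLE_1.0 and EXAMPLE_1.1. */
    int example_op_v1_0(int x)      /* frozen old-ABI behavior */
    {
            return x + 1;
    }

    int example_op_v1_1(int x)      /* current implementation */
    {
            return x + 2;
    }

    /* Existing binaries keep resolving example_op@EXAMPLE_1.0 to the old code;
     * new links bind to the default EXAMPLE_1.1 version. */
    __asm__(".symver example_op_v1_0, example_op@EXAMPLE_1.0");
    __asm__(".symver example_op_v1_1, example_op@@EXAMPLE_1.1");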
- */ - -#include "config.h" - -#include -#include -#include -#include -#include -#include - -#include -#include "ofi_util.h" -#include "ofi.h" -#include "shared/ofi_str.h" -#include "ofi_prov.h" -#include "ofi_perf.h" -#include "ofi_hmem.h" -#include "rdma/fi_ext.h" - -#ifdef HAVE_LIBDL -#include -#endif - - -struct ofi_prov { - struct ofi_prov *next; - char *prov_name; - struct fi_provider *provider; - void *dlhandle; - bool hidden; -}; - -enum ofi_prov_order { - OFI_PROV_ORDER_VERSION, - OFI_PROV_ORDER_REGISTER, -}; - -static struct ofi_prov *prov_head, *prov_tail; -static enum ofi_prov_order prov_order = OFI_PROV_ORDER_VERSION; -int ofi_init = 0; -extern struct ofi_common_locks common_locks; - -static struct fi_filter prov_filter; - - -static struct ofi_prov * -ofi_alloc_prov(const char *prov_name) -{ - struct ofi_prov *prov; - - prov = calloc(sizeof *prov, 1); - if (!prov) - return NULL; - - prov->prov_name = strdup(prov_name); - if (!prov->prov_name) { - free(prov); - return NULL; - } - - return prov; -} - -static void -ofi_init_prov(struct ofi_prov *prov, struct fi_provider *provider, - void *dlhandle) -{ - prov->provider = provider; - prov->dlhandle = dlhandle; -} - -static void ofi_cleanup_prov(struct fi_provider *provider, void *dlhandle) -{ - if (provider) { - fi_param_undefine(provider); - if (provider->cleanup) - provider->cleanup(); - } - -#ifdef HAVE_LIBDL - if (dlhandle) - dlclose(dlhandle); -#else - OFI_UNUSED(dlhandle); -#endif -} - -static void ofi_free_prov(struct ofi_prov *prov) -{ - ofi_cleanup_prov(prov->provider, prov->dlhandle); - free(prov->prov_name); - free(prov); -} - -static void ofi_insert_prov(struct ofi_prov *prov) -{ - struct ofi_prov *cur, *prev; - - for (prev = NULL, cur = prov_head; cur; prev = cur, cur = cur->next) { - if ((strlen(prov->prov_name) == strlen(cur->prov_name)) && - !strcasecmp(prov->prov_name, cur->prov_name)) { - if ((prov_order == OFI_PROV_ORDER_VERSION) && - FI_VERSION_LT(cur->provider->version, - prov->provider->version)) { - cur->hidden = true; - prov->next = cur; - if (prev) - prev->next = prov; - else - prov_head = prov; - } else { - prov->hidden = true; - prov->next = cur->next; - cur->next = prov; - if (prov_tail == cur) - prov_tail = prov; - } - return; - } - } - - if (prov_tail) - prov_tail->next = prov; - else - prov_head = prov; - prov_tail = prov; -} - -static int ofi_find_name(char **names, const char *name) -{ - int i; - - for (i = 0; names[i]; i++) { - if (!strcasecmp(name, names[i])) - return i; - } - return -1; -} - -/* matches if names[i] == "xxx;yyy" and name == "xxx" */ -static int ofi_find_layered_name(char **names, const char *name) -{ - int i; - size_t len; - - len = strlen(name); - for (i = 0; names[i]; i++) { - if (!strncasecmp(name, names[i], len) && names[i][len] == ';' ) - return i; - } - return -1; -} - -/* matches if names[i] == "xxx" and name == "xxx;yyy" */ -static int ofi_find_core_name(char **names, const char *name) -{ - int i; - size_t len; - - for (i = 0; names[i]; i++) { - len = strlen(names[i]); - if (!strncasecmp(name, names[i], len) && name[len] == ';' ) - return i; - } - return -1; -} - -static void ofi_closest_prov_names(char *prov_name, char* miss_prov_name, int n) -{ - if (strncasecmp( prov_name, miss_prov_name, n ) == 0 ) { - FI_WARN(&core_prov, FI_LOG_CORE, - "Instead misspelled provider: %s, you may want: %s?\n", - miss_prov_name, prov_name); - } -} - -static void ofi_suggest_prov_names(char *name_to_match) -{ - struct ofi_prov *prov; - for (prov = prov_head; prov; prov = prov->next) { - if 
(strlen(prov->prov_name) != strlen(name_to_match) - && !strncasecmp(prov->prov_name, name_to_match, - strlen(name_to_match))) { - if (strlen(name_to_match) > 5) - ofi_closest_prov_names(prov->prov_name, - name_to_match, 5); - else - ofi_closest_prov_names(prov->prov_name, - name_to_match, 2); - } - } -} - -static enum ofi_prov_type ofi_prov_type(const struct fi_provider *provider) -{ - const struct fi_prov_context *ctx; - ctx = (const struct fi_prov_context *) &provider->context; - return ctx->type; -} - -static int ofi_disable_util_layering(const struct fi_provider *provider) { - const struct fi_prov_context *ctx; - - ctx = (const struct fi_prov_context *) &provider->context; - return ctx->disable_layering; -} - -static int ofi_is_util_prov(const struct fi_provider *provider) -{ - return ofi_prov_type(provider) == OFI_PROV_UTIL; -} - -static int ofi_is_core_prov(const struct fi_provider *provider) -{ - return ofi_prov_type(provider) == OFI_PROV_CORE; -} - -static int ofi_is_hook_prov(const struct fi_provider *provider) -{ - return ofi_prov_type(provider) == OFI_PROV_HOOK; -} - -int ofi_apply_filter(struct fi_filter *filter, const char *name) -{ - if (!filter->names) - return 0; - - if (ofi_find_name(filter->names, name) >= 0) - return filter->negated ? 1 : 0; - - return filter->negated ? 0 : 1; -} - -/* - * The provider init filter is used to filter out unnecessary core providers - * at the initialization time. Utility providers are not concerned. - * - * Special handling is needed for layered provider names: - * - * If the filter is not negated, a name "xxx;yyy" in the filter should match - * input "xxx" to ensure that the core provider "xxx" is included. - * - * If the filter is negated, a name "xxx;yyy" in the filter should not match - * input "xxx" otherwise the core provider "xxx" may be incorrectly filtered - * out. - */ -int ofi_apply_prov_init_filter(struct fi_filter *filter, const char *name) -{ - if (!filter->names) - return 0; - - if (ofi_find_name(filter->names, name) >= 0) - return filter->negated ? 1 : 0; - - if (filter->negated) - return 0; - - if (ofi_find_layered_name(filter->names, name) >= 0) - return 0; - - return 1; -} - -/* - * The provider post filter is used to remove unwanted entries from the fi_info - * list before returning from fi_getinfo(). - * - * Layered provider names are handled in the same way as non-layered provider - * names -- requiring full match. - * - * In addition, a name "xxx" in the filter should be able to match an input - * "xxx;yyy" to allow extra layering on top of what is requested by the user. - */ -int ofi_apply_prov_post_filter(struct fi_filter *filter, const char *name) -{ - if (!filter->names) - return 0; - - if (ofi_find_name(filter->names, name) >= 0 || - ofi_find_core_name(filter->names, name) >= 0) - return filter->negated ? 1 : 0; - - return filter->negated ? 0 : 1; -} - -static int ofi_getinfo_filter(const struct fi_provider *provider) -{ - /* Positive filters only apply to core providers. They must be - * explicitly enabled by the filter. Other providers (i.e. utility) - * are automatically enabled in this case, so that they can work - * over any enabled core filter. Negative filters may be used - * to disable any provider. 
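The include/exclude decision described above is small enough to model directly. The toy below mirrors the behavior of the removed ofi_apply_filter(): an empty filter keeps everything, a plain name list keeps only the listed providers, and a list that began with '^' (the negated case, as users typically write it in the FI_PROVIDER setting) excludes them instead. The types and helper here are invented for the example.

    /* Toy model of the removed ofi_apply_filter(); returns 1 = skip provider. */
    #include <stdio.h>
    #include <strings.h>

    struct name_filter {
            const char **names;     /* NULL-terminated, e.g. {"verbs","tcp",NULL} */
            int negated;            /* 1 when the user-supplied list started with '^' */
    };

    static int apply_filter(const struct name_filter *f, const char *prov)
    {
            int listed = 0;

            if (!f->names)
                    return 0;                       /* no filter: keep everything */
            for (const char **n = f->names; *n; n++)
                    if (!strcasecmp(*n, prov))
                            listed = 1;
            return f->negated ? listed : !listed;
    }

    int main(void)
    {
            const char *names[] = { "verbs", "tcp", NULL };
            struct name_filter keep_only = { names, 0 };    /* "verbs,tcp"  */
            struct name_filter exclude = { names, 1 };      /* "^verbs,tcp" */

            printf("%d %d\n", apply_filter(&keep_only, "psm3"),     /* 1: skipped */
                   apply_filter(&exclude, "psm3"));                 /* 0: kept */
            return 0;
    }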
- */ - if (!prov_filter.negated && !ofi_is_core_prov(provider)) - return 0; - - return ofi_apply_prov_init_filter(&prov_filter, provider->name); -} - -static void ofi_filter_info(struct fi_info **info) -{ - struct fi_info *cur, *prev, *tmp; - - if (!prov_filter.names) - return; - - prev = NULL; - cur = *info; - while (cur) { - assert(cur->fabric_attr && cur->fabric_attr->prov_name); - - if (ofi_apply_prov_post_filter(&prov_filter, cur->fabric_attr->prov_name)) { - tmp = cur; - cur = cur->next; - if (prev) - prev->next = cur; - else - *info = cur; - tmp->next = NULL; - fi_freeinfo(tmp); - } else { - prev = cur; - cur = cur->next; - } - } -} - -static struct ofi_prov *ofi_getprov(const char *prov_name, size_t len) -{ - struct ofi_prov *prov; - - for (prov = prov_head; prov; prov = prov->next) { - if ((strlen(prov->prov_name) == len) && - !strncasecmp(prov->prov_name, prov_name, len)) - return prov; - } - - return NULL; -} - -static struct fi_provider *ofi_get_hook(const char *name) -{ - struct ofi_prov *prov; - struct fi_provider *provider = NULL; - char *try_name = NULL; - int ret; - - prov = ofi_getprov(name, strlen(name)); - if (!prov) { - ret = asprintf(&try_name, "ofi_hook_%s", name); - if (ret > 0) - prov = ofi_getprov(try_name, ret); - else - try_name = NULL; - } - - if (prov) { - if (prov->provider && ofi_is_hook_prov(prov->provider)) { - provider = prov->provider; - } else { - FI_WARN(&core_prov, FI_LOG_CORE, - "Specified provider is not a hook: %s\n", name); - } - } else { - FI_WARN(&core_prov, FI_LOG_CORE, - "No hook found for: %s\n", name); - } - - free(try_name); - return provider; -} - -/* This is the default order that providers will be reported when a provider - * is available. Initialize the socket(s) provider last. This will result in - * it being the least preferred provider. - */ -static void ofi_ordered_provs_init(void) -{ - char *ordered_prov_names[] = { - "efa", "opx", "psm2", "psm", "usnic", "gni", "bgq", "verbs", - "netdir", "psm3", "ofi_rxm", "ofi_rxd", "shm", - /* Initialize the socket based providers last of the - * standard providers. This will result in them being - * the least preferred providers. - */ - - /* Before you add ANYTHING here, read the comment above!!! */ - "udp", "tcp", "sockets", /* NOTHING GOES HERE! */ - /* Seriously, read it! */ - - /* These are hooking providers only. 
Their order - * doesn't matter - */ - "ofi_hook_perf", "ofi_hook_debug", "ofi_hook_noop", "ofi_hook_hmem", - "ofi_hook_dmabuf_peer_mem", - }; - struct ofi_prov *prov; - int num_provs, i; - - num_provs = sizeof(ordered_prov_names) / sizeof(ordered_prov_names[0]); - - for (i = 0; i < num_provs; i++) { - prov = ofi_alloc_prov(ordered_prov_names[i]); - if (prov) - ofi_insert_prov(prov); - } -} - -static void ofi_set_prov_type(struct fi_prov_context *ctx, - struct fi_provider *provider) -{ - if (!provider->getinfo) - ctx->type = OFI_PROV_HOOK; - else if (ofi_has_util_prefix(provider->name)) - ctx->type = OFI_PROV_UTIL; - else - ctx->type = OFI_PROV_CORE; -} - -static void ofi_register_provider(struct fi_provider *provider, void *dlhandle) -{ - struct fi_prov_context *ctx; - struct ofi_prov *prov = NULL; - bool hidden = false; - - if (!provider || !provider->name) { - FI_DBG(&core_prov, FI_LOG_CORE, - "no provider structure or name\n"); - goto cleanup; - } - - FI_INFO(&core_prov, FI_LOG_CORE, - "registering provider: %s (%d.%d)\n", provider->name, - FI_MAJOR(provider->version), FI_MINOR(provider->version)); - - if (!provider->fabric) { - FI_WARN(&core_prov, FI_LOG_CORE, - "provider missing mandatory entry points\n"); - goto cleanup; - } - - /* The current core implementation is not backward compatible - * with providers that support a release earlier than v1.3. - * See commit 0f4b6651. - */ - if (provider->fi_version < FI_VERSION(1, 3)) { - FI_INFO(&core_prov, FI_LOG_CORE, - "provider has unsupported FI version " - "(provider %d.%d != libfabric %d.%d); ignoring\n", - FI_MAJOR(provider->fi_version), - FI_MINOR(provider->fi_version), FI_MAJOR_VERSION, - FI_MINOR_VERSION); - goto cleanup; - } - - ctx = (struct fi_prov_context *) &provider->context; - ofi_set_prov_type(ctx, provider); - - if (ofi_getinfo_filter(provider)) { - FI_INFO(&core_prov, FI_LOG_CORE, - "\"%s\" filtered by provider include/exclude " - "list, skipping\n", provider->name); - hidden = true; - } - - if (ofi_apply_filter(&prov_log_filter, provider->name)) - ctx->disable_logging = 1; - - /* - * Prevent utility providers from layering on these core providers - * unless explicitly requested. 
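To make the "unless explicitly requested" case concrete: an application can still get a utility provider stacked on one of these core providers by naming the pair itself in the hints, utility first and core last. A minimal, hedged example follows (error handling omitted; the ofi_rxm-over-psm3 pairing and the API version number are only illustrations).

    /* Explicitly request ofi_rxm layered over psm3 via the hints prov_name. */
    #include <string.h>
    #include <rdma/fabric.h>

    int main(void)
    {
            struct fi_info *hints = fi_allocinfo(), *info = NULL;

            if (!hints)
                    return 1;
            hints->fabric_attr->prov_name = strdup("ofi_rxm;psm3");
            if (!fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info))
                    fi_freeinfo(info);
            fi_freeinfo(hints);
            return 0;
    }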
- */ - if (!strcasecmp(provider->name, "sockets") || - !strcasecmp(provider->name, "shm") || - !strcasecmp(provider->name, "efa") || - !strcasecmp(provider->name, "psm3") || ofi_is_util_prov(provider)) - ctx->disable_layering = 1; - - prov = ofi_getprov(provider->name, strlen(provider->name)); - if (prov && !prov->provider) { - ofi_init_prov(prov, provider, dlhandle); - } else { - prov = ofi_alloc_prov(provider->name); - if (!prov) - goto cleanup; - - ofi_init_prov(prov, provider, dlhandle); - ofi_insert_prov(prov); - } - - if (hidden) - prov->hidden = true; - return; - -cleanup: - ofi_cleanup_prov(provider, dlhandle); -} - -#ifdef HAVE_LIBDL -static int lib_filter(const struct dirent *entry) -{ - size_t l = strlen(entry->d_name); - size_t sfx = sizeof (FI_LIB_SUFFIX) - 1; - - if (l > sfx) - return !strcmp(&(entry->d_name[l-sfx]), FI_LIB_SUFFIX); - else - return 0; -} -#endif - -static int verify_filter_names(char **names) -{ - int i, j; - char** split_names; - for (i = 0; names[i]; i++) { - split_names = ofi_split_and_alloc(names[i], ";", NULL); - if (!split_names) { - FI_WARN(&core_prov, FI_LOG_CORE, - "unable to parse given filter string\n"); - return -FI_ENODATA; - } - - for(j = 0; split_names[j]; j++) { - if(!ofi_getprov(split_names[j], strlen(split_names[j]))) { - FI_WARN(&core_prov, FI_LOG_CORE, - "provider %s is unknown, misspelled" - " or DL provider?\n", split_names[j]); - ofi_suggest_prov_names(split_names[j]); - } - } - ofi_free_string_array(split_names); - } - - return FI_SUCCESS; -} - -void ofi_free_filter(struct fi_filter *filter) -{ - ofi_free_string_array(filter->names); -} - -void ofi_create_filter(struct fi_filter *filter, const char *raw_filter) -{ - memset(filter, 0, sizeof *filter); - if (raw_filter == NULL) - return; - - if (*raw_filter == '^') { - filter->negated = 1; - ++raw_filter; - } - - filter->names = ofi_split_and_alloc(raw_filter, ",", NULL); - if (!filter->names) { - FI_WARN(&core_prov, FI_LOG_CORE, - "unable to parse filter from: %s\n", raw_filter); - return; - } - - if (verify_filter_names(filter->names)) - FI_WARN(&core_prov, FI_LOG_CORE, - "unable to verify filter name\n"); -} - -#ifdef HAVE_LIBDL -static void ofi_reg_dl_prov(const char *lib) -{ - void *dlhandle; - struct fi_provider* (*inif)(void); - - FI_DBG(&core_prov, FI_LOG_CORE, "opening provider lib %s\n", lib); - - dlhandle = dlopen(lib, RTLD_NOW); - if (dlhandle == NULL) { - FI_DBG(&core_prov, FI_LOG_CORE, - "dlopen(%s): %s\n", lib, dlerror()); - return; - } - - inif = dlsym(dlhandle, "fi_prov_ini"); - if (inif == NULL) { - FI_WARN(&core_prov, FI_LOG_CORE, "dlsym: %s\n", dlerror()); - dlclose(dlhandle); - } else { - ofi_register_provider((inif)(), dlhandle); - } -} - -static void ofi_ini_dir(const char *dir) -{ - int n; - char *lib; - struct dirent **liblist = NULL; - - n = scandir(dir, &liblist, lib_filter, alphasort); - if (n < 0) - goto libdl_done; - - while (n--) { - if (asprintf(&lib, "%s/%s", dir, liblist[n]->d_name) < 0) { - FI_WARN(&core_prov, FI_LOG_CORE, - "asprintf failed to allocate memory\n"); - goto libdl_done; - } - ofi_reg_dl_prov(lib); - - free(liblist[n]); - free(lib); - } - -libdl_done: - while (n-- > 0) - free(liblist[n]); - free(liblist); -} - -/* Search standard system library paths (i.e. LD_LIBRARY_PATH) for DLLs for - * known providers. 
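For reference while reading the loader code below: the candidate file names it probes are built as "lib" + provider + "-" + FI_LIB_SUFFIX, so on a typical Linux build the psm3 entry is looked up as something like libpsm3-fi.so. The snippet is only a name-construction illustration; the suffix value is an assumption to verify against the real build headers.

    /* Name construction only; FI_LIB_SUFFIX is assumed to be "fi.so" here. */
    #include <stdio.h>

    int main(void)
    {
            char lib[64];

            snprintf(lib, sizeof(lib), "lib%s-%s", "psm3", "fi.so");
            printf("%s\n", lib);    /* libpsm3-fi.so */
            return 0;
    }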
- */ -static void ofi_find_prov_libs(void) -{ - const char* lib_prefix = "lib"; - struct ofi_prov *prov; - char* lib; - char* short_prov_name; - - for (prov = prov_head; prov; prov = prov->next) { - if (!prov->prov_name) - continue; - - if (ofi_has_util_prefix(prov->prov_name)) { - short_prov_name = prov->prov_name + strlen(OFI_UTIL_PREFIX); - } else { - short_prov_name = prov->prov_name; - } - - if (asprintf(&lib, "%s%s%s%s", lib_prefix, - short_prov_name, "-", FI_LIB_SUFFIX) < 0) { - FI_WARN(&core_prov, FI_LOG_CORE, - "asprintf failed to allocate memory\n"); - continue; - } - - ofi_reg_dl_prov(lib); - free(lib); - } -} - -static void ofi_load_dl_prov(void) -{ - char **dirs; - char *provdir = NULL; - void *dlhandle; - int i; - - /* If dlopen fails, assume static linking and return */ - dlhandle = dlopen(NULL, RTLD_NOW); - if (!dlhandle) - return; - dlclose(dlhandle); - - fi_param_define(NULL, "provider_path", FI_PARAM_STRING, - "Search for providers in specific path. Path is " - "specified similar to dir1:dir2:dir3. If the path " - "starts with @, loaded providers are given preference " - "based on discovery order, rather than version. " - "(default: " PROVDLDIR ")"); - - fi_param_get_str(NULL, "provider_path", &provdir); - if (!provdir || !strlen(provdir)) { - ofi_find_prov_libs(); - dirs = ofi_split_and_alloc(PROVDLDIR, ":", NULL); - } else if (provdir[0] == '@') { - prov_order = OFI_PROV_ORDER_REGISTER; - if (strlen(provdir) == 1) - dirs = ofi_split_and_alloc(PROVDLDIR, ":", NULL); - else - dirs = ofi_split_and_alloc(&provdir[1], ":", NULL); - } else { - dirs = ofi_split_and_alloc(provdir, ":", NULL); - } - - if (dirs) { - for (i = 0; dirs[i]; i++) - ofi_ini_dir(dirs[i]); - - ofi_free_string_array(dirs); - } -} - -#else /* HAVE_LIBDL */ - -static void ofi_load_dl_prov(void) -{ -} - -#endif - -static char **hooks; -static size_t hook_cnt; - -/* - * Call the fabric() interface of the hooking provider. We pass in the - * fabric being hooked via the fabric attributes and the corresponding - * fi_provider structure as the context. - */ -static void ofi_hook_install(struct fid_fabric *hfabric, - struct fid_fabric **fabric, - struct fi_provider *prov) -{ - struct fi_provider *hook_prov; - struct fi_fabric_attr attr; - int i, ret; - - *fabric = hfabric; - if (!hook_cnt || !hooks) - return; - - memset(&attr, 0, sizeof attr); - - for (i = 0; i < hook_cnt; i++) { - hook_prov = ofi_get_hook(hooks[i]); - if (!hook_prov) - continue; - - attr.fabric = hfabric; - ret = hook_prov->fabric(&attr, fabric, prov); - if (ret) - continue; - - hfabric = *fabric; - } -} - -static void ofi_hook_init(void) -{ - char *param_val = NULL; - - fi_param_define(NULL, "hook", FI_PARAM_STRING, - "Intercept calls to underlying provider and apply " - "the specified functionality to them. 
Hook option: " - "perf (gather performance data)"); - fi_param_get_str(NULL, "hook", &param_val); - - if (!param_val) - return; - - hooks = ofi_split_and_alloc(param_val, ";", &hook_cnt); -} - -static void ofi_hook_fini(void) -{ - if (hooks) - ofi_free_string_array(hooks); -} - -void fi_ini(void) -{ - char *param_val = NULL; - - pthread_mutex_lock(&common_locks.ini_lock); - - if (ofi_init) - goto unlock; - - ofi_ordered_provs_init(); - fi_param_init(); - fi_log_init(); - ofi_osd_init(); - ofi_mem_init(); - ofi_pmem_init(); - ofi_perf_init(); - ofi_hook_init(); - ofi_hmem_init(); - ofi_monitors_init(); - - fi_param_define(NULL, "provider", FI_PARAM_STRING, - "Only use specified provider (default: all available)"); - fi_param_get_str(NULL, "provider", &param_val); - ofi_create_filter(&prov_filter, param_val); - - fi_param_define(NULL, "fork_unsafe", FI_PARAM_BOOL, - "Whether use of fork() may be unsafe for some providers " - "(default: no). Setting this to yes could improve " - "performance at the expense of making fork() potentially " - "unsafe"); - fi_param_define(NULL, "universe_size", FI_PARAM_SIZE_T, - "Defines the maximum number of processes that will be " - "used by distribute OFI application. The provider uses " - "this to optimize resource allocations " - "(default: provider specific)"); - fi_param_get_size_t(NULL, "universe_size", &ofi_universe_size); - - fi_param_define(NULL, "poll_fairness", FI_PARAM_INT, - "This counter value controls calling poll() on a list " - "of sockets and file descriptors and is most relevant " - "when using the tcp provider with the pollfd wait " - "object. The pollfd abstraction maintains a list of " - "active or hot fd's that it monitors. This variable " - "controls the number of times that the active fd's " - "list is checked relative to the full set of fd's " - "being monitored. A value of 0 disables the active " - "list. 
Default (%d)", ofi_poll_fairness); - fi_param_get_int(NULL, "poll_fairness", &ofi_poll_fairness); - - ofi_load_dl_prov(); - - ofi_register_provider(PSM3_INIT, NULL); - ofi_register_provider(PSM2_INIT, NULL); - ofi_register_provider(PSM_INIT, NULL); - ofi_register_provider(USNIC_INIT, NULL); - ofi_register_provider(GNI_INIT, NULL); - ofi_register_provider(BGQ_INIT, NULL); - ofi_register_provider(NETDIR_INIT, NULL); - ofi_register_provider(SHM_INIT, NULL); - ofi_register_provider(RXM_INIT, NULL); - ofi_register_provider(VERBS_INIT, NULL); - /* ofi_register_provider(RSTREAM_INIT, NULL); - no support */ - ofi_register_provider(MRAIL_INIT, NULL); - ofi_register_provider(RXD_INIT, NULL); - ofi_register_provider(EFA_INIT, NULL); - ofi_register_provider(OPX_INIT, NULL); - ofi_register_provider(UDP_INIT, NULL); - ofi_register_provider(SOCKETS_INIT, NULL); - ofi_register_provider(TCP_INIT, NULL); - - ofi_register_provider(HOOK_PERF_INIT, NULL); - ofi_register_provider(HOOK_DEBUG_INIT, NULL); - ofi_register_provider(HOOK_HMEM_INIT, NULL); - ofi_register_provider(HOOK_DMABUF_PEER_MEM_INIT, NULL); - ofi_register_provider(HOOK_NOOP_INIT, NULL); - - ofi_init = 1; - -unlock: - pthread_mutex_unlock(&common_locks.ini_lock); -} - -FI_DESTRUCTOR(fi_fini(void)) -{ - struct ofi_prov *prov; - - pthread_mutex_lock(&common_locks.ini_lock); - - if (!ofi_init) - goto unlock; - - while (prov_head) { - prov = prov_head; - prov_head = prov->next; - ofi_free_prov(prov); - } - - ofi_free_filter(&prov_filter); - ofi_monitors_cleanup(); - ofi_hmem_cleanup(); - ofi_hook_fini(); - ofi_mem_fini(); - fi_log_fini(); - fi_param_fini(); - ofi_osd_fini(); - - ofi_init = 0; - -unlock: - pthread_mutex_unlock(&common_locks.ini_lock); -} - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -void DEFAULT_SYMVER_PRE(fi_freeinfo)(struct fi_info *info) -{ - struct fi_info *next; - - for (; info; info = next) { - next = info->next; - - free(info->src_addr); - free(info->dest_addr); - free(info->tx_attr); - free(info->rx_attr); - if (info->ep_attr) { - free(info->ep_attr->auth_key); - free(info->ep_attr); - } - if (info->domain_attr) { - free(info->domain_attr->auth_key); - free(info->domain_attr->name); - free(info->domain_attr); - } - if (info->fabric_attr) { - free(info->fabric_attr->name); - free(info->fabric_attr->prov_name); - free(info->fabric_attr); - } - if (info->nic && - FI_CHECK_OP(info->nic->fid.ops, struct fi_ops, close)) { - fi_close(&info->nic->fid); - } - free(info); - } -} -DEFAULT_SYMVER(fi_freeinfo_, fi_freeinfo, FABRIC_1.3); - -/* - * Make a dummy info object for each provider, and copy in the - * provider name and version. We report utility providers directly - * to export their version. 
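The dummy-info list described above is what a caller receives when it asks fi_getinfo() for provider attributes only. A small consumer, shown as a hedged sketch (the requested API version is arbitrary and error handling is minimal), could look like this:

    /* List registered providers and their versions via FI_PROV_ATTR_ONLY. */
    #include <stdio.h>
    #include <rdma/fabric.h>

    int main(void)
    {
            struct fi_info *info, *cur;

            if (fi_getinfo(FI_VERSION(1, 9), NULL, NULL, FI_PROV_ATTR_ONLY,
                           NULL, &info))
                    return 1;
            for (cur = info; cur; cur = cur->next)
                    printf("%s %u.%u\n", cur->fabric_attr->prov_name,
                           FI_MAJOR(cur->fabric_attr->prov_version),
                           FI_MINOR(cur->fabric_attr->prov_version));
            fi_freeinfo(info);
            return 0;
    }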
- */ -static int ofi_getprovinfo(struct fi_info **info) -{ - struct ofi_prov *prov; - struct fi_info *tail, *cur; - int ret = -FI_ENODATA; - - *info = tail = NULL; - for (prov = prov_head; prov; prov = prov->next) { - if (!prov->provider) - continue; - - cur = fi_allocinfo(); - if (!cur) { - ret = -FI_ENOMEM; - goto err; - } - - cur->fabric_attr->prov_name = strdup(prov->provider->name); - cur->fabric_attr->prov_version = prov->provider->version; - - if (!*info) { - *info = tail = cur; - } else { - tail->next = cur; - } - tail = cur; - - ret = 0; - } - - return ret; - -err: - while (tail) { - cur = tail->next; - fi_freeinfo(tail); - tail = cur; - } - return ret; -} - -static void ofi_set_prov_attr(struct fi_fabric_attr *attr, - struct fi_provider *prov) -{ - char *core_name; - - core_name = attr->prov_name; - if (core_name) { - assert(ofi_is_util_prov(prov)); - attr->prov_name = ofi_strdup_append(core_name, prov->name); - free(core_name); - } else { - assert(ofi_is_core_prov(prov)); - attr->prov_name = strdup(prov->name); - } - attr->prov_version = prov->version; -} - -/* - * The layering of utility providers over core providers follows these rules. - * 0. Provider names are delimited by ";" - * 1. Rules when # of providers <= 2: - * 1a. If both are specified, then only return that layering - * 1b. If a utility provider is specified, return it over any* core provider. - * 1c. If a core provider is specified, return any utility provider that can - * layer over it, plus the core provider itself, if possible. - * 1d. A utility provider will not layer over a provider that has disabled - * utility provider layering unless the user explicitly requests that - * combination. - * 1e. OFI_CORE_PROV_ONLY flag prevents utility providers layering over other - * utility providers. - * 2. If both the providers are utility providers or if more than two providers - * are specified, the rightmost provider would be compared. - * 3. If any provider has a caret symbol "^" is prefixed before any provider - * name it would be excluded (internal use only). These excluded providers - * should be listed only at the end. - */ -static int ofi_layering_ok(const struct fi_provider *provider, - char **prov_vec, size_t count, - uint64_t flags) -{ - char *prov_name; - struct ofi_prov *core_ofi_prov; - ssize_t i; - - /* Excluded providers must be at the end */ - for (i = count - 1; i >= 0; i--) { - if (prov_vec[i][0] != '^') - break; - - if (!strcasecmp(&prov_vec[i][1], provider->name)) - return 0; - } - count = i + 1; - - if (flags & OFI_CORE_PROV_ONLY) { - assert((count == 1) || (count == 0)); - if (!ofi_is_core_prov(provider)) { - FI_INFO(&core_prov, FI_LOG_CORE, - "Need core provider, skipping %s\n", - provider->name); - return 0; - } - - if ((count == 0) && ofi_disable_util_layering(provider)) { - FI_INFO(&core_prov, FI_LOG_CORE, - "Skipping util;%s layering\n", provider->name); - return 0; - } - } - - if (!count) - return 1; - - /* To maintain backward compatibility with the previous behavior of - * ofi_layering_ok we need to check if the # of providers is two or - * fewer. 
In such a case, we have to be agnostic to the ordering of - * core and utility providers */ - - if ((count == 1) && ofi_is_util_prov(provider) && - !ofi_has_util_prefix(prov_vec[0])) { - core_ofi_prov = ofi_getprov(prov_vec[0], strlen(prov_vec[0])); - if (core_ofi_prov && core_ofi_prov->provider && - ofi_disable_util_layering(core_ofi_prov->provider)) { - FI_INFO(&core_prov, FI_LOG_CORE, - "Skipping %s;%s layering\n", prov_vec[0], - provider->name); - return 0; - } - return 1; - } - - if ((count == 2) && ofi_has_util_prefix(prov_vec[0]) && - !ofi_has_util_prefix(prov_vec[1])) - prov_name = prov_vec[0]; - else - prov_name = prov_vec[count - 1]; - - return !strcasecmp(provider->name, prov_name); -} - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int DEFAULT_SYMVER_PRE(fi_getinfo)(uint32_t version, const char *node, - const char *service, uint64_t flags, - const struct fi_info *hints, struct fi_info **info) -{ - struct ofi_prov *prov; - struct fi_info *tail, *cur; - char **prov_vec = NULL; - size_t count = 0; - enum fi_log_level level; - int ret; - - fi_ini(); - - if (FI_VERSION_LT(fi_version(), version)) { - FI_WARN(&core_prov, FI_LOG_CORE, - "Requested version is newer than library\n"); - return -FI_ENOSYS; - } - - if (flags == FI_PROV_ATTR_ONLY) { - return ofi_getprovinfo(info); - } - - if (hints && hints->fabric_attr && hints->fabric_attr->prov_name) { - prov_vec = ofi_split_and_alloc(hints->fabric_attr->prov_name, - ";", &count); - if (!prov_vec) - return -FI_ENOMEM; - FI_DBG(&core_prov, FI_LOG_CORE, "hints prov_name: %s\n", - hints->fabric_attr->prov_name); - } - - *info = tail = NULL; - for (prov = prov_head; prov; prov = prov->next) { - if (!prov->provider || !prov->provider->getinfo) - continue; - - if (prov->hidden && !(flags & OFI_GETINFO_HIDDEN)) - continue; - - if (!ofi_layering_ok(prov->provider, prov_vec, count, flags)) - continue; - - if (FI_VERSION_LT(prov->provider->fi_version, version)) { - FI_WARN(&core_prov, FI_LOG_CORE, - "Provider %s fi_version %d.%d < requested %d.%d\n", - prov->provider->name, - FI_MAJOR(prov->provider->fi_version), - FI_MINOR(prov->provider->fi_version), - FI_MAJOR(version), FI_MINOR(version)); - continue; - } - - cur = NULL; - ret = prov->provider->getinfo(version, node, service, flags, - hints, &cur); - if (ret) { - level = ((hints && hints->fabric_attr && - hints->fabric_attr->prov_name) ? - FI_LOG_WARN : FI_LOG_INFO); - - FI_LOG(&core_prov, level, FI_LOG_CORE, - "fi_getinfo: provider %s returned -%d (%s)\n", - prov->provider->name, -ret, fi_strerror(-ret)); - continue; - } - - if (!cur) { - FI_WARN(&core_prov, FI_LOG_CORE, - "fi_getinfo: provider %s output empty list\n", - prov->provider->name); - continue; - } - - FI_DBG(&core_prov, FI_LOG_CORE, "fi_getinfo: provider %s " - "returned success\n", prov->provider->name); - - if (!*info) - *info = cur; - else - tail->next = cur; - - for (tail = cur; tail->next; tail = tail->next) { - ofi_set_prov_attr(tail->fabric_attr, prov->provider); - tail->fabric_attr->api_version = version; - } - ofi_set_prov_attr(tail->fabric_attr, prov->provider); - tail->fabric_attr->api_version = version; - } - ofi_free_string_array(prov_vec); - - if (!(flags & (OFI_CORE_PROV_ONLY | OFI_GETINFO_INTERNAL | - OFI_GETINFO_HIDDEN))) - ofi_filter_info(info); - - return *info ? 
0 : -FI_ENODATA; -} -DEFAULT_SYMVER(fi_getinfo_, fi_getinfo, FABRIC_1.3); - -struct fi_info *ofi_allocinfo_internal(void) -{ - struct fi_info *info; - - info = calloc(1, sizeof(*info)); - if (!info) - return NULL; - - info->tx_attr = calloc(1, sizeof(*info->tx_attr)); - info->rx_attr = calloc(1, sizeof(*info->rx_attr)); - info->ep_attr = calloc(1, sizeof(*info->ep_attr)); - info->domain_attr = calloc(1, sizeof(*info->domain_attr)); - info->fabric_attr = calloc(1, sizeof(*info->fabric_attr)); - if (!info->tx_attr|| !info->rx_attr || !info->ep_attr || - !info->domain_attr || !info->fabric_attr) - goto err; - - return info; -err: - fi_freeinfo(info); - return NULL; -} - - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -struct fi_info *DEFAULT_SYMVER_PRE(fi_dupinfo)(const struct fi_info *info) -{ - struct fi_info *dup; - int ret; - - if (!info) - return ofi_allocinfo_internal(); - - dup = mem_dup(info, sizeof(*dup)); - if (dup == NULL) { - return NULL; - } - dup->src_addr = NULL; - dup->dest_addr = NULL; - dup->tx_attr = NULL; - dup->rx_attr = NULL; - dup->ep_attr = NULL; - dup->domain_attr = NULL; - dup->fabric_attr = NULL; - dup->next = NULL; - - if (info->src_addr != NULL) { - dup->src_addr = mem_dup(info->src_addr, info->src_addrlen); - if (dup->src_addr == NULL) - goto fail; - } - if (info->dest_addr != NULL) { - dup->dest_addr = mem_dup(info->dest_addr, info->dest_addrlen); - if (dup->dest_addr == NULL) - goto fail; - } - if (info->tx_attr != NULL) { - dup->tx_attr = mem_dup(info->tx_attr, sizeof(*info->tx_attr)); - if (dup->tx_attr == NULL) - goto fail; - } - if (info->rx_attr != NULL) { - dup->rx_attr = mem_dup(info->rx_attr, sizeof(*info->rx_attr)); - if (dup->rx_attr == NULL) - goto fail; - } - if (info->ep_attr != NULL) { - dup->ep_attr = mem_dup(info->ep_attr, sizeof(*info->ep_attr)); - if (dup->ep_attr == NULL) - goto fail; - if (info->ep_attr->auth_key != NULL) { - dup->ep_attr->auth_key = - mem_dup(info->ep_attr->auth_key, - info->ep_attr->auth_key_size); - if (dup->ep_attr->auth_key == NULL) - goto fail; - } - } - if (info->domain_attr) { - dup->domain_attr = mem_dup(info->domain_attr, - sizeof(*info->domain_attr)); - if (dup->domain_attr == NULL) - goto fail; - dup->domain_attr->name = NULL; - dup->domain_attr->auth_key = NULL; - if (info->domain_attr->name != NULL) { - dup->domain_attr->name = strdup(info->domain_attr->name); - if (dup->domain_attr->name == NULL) - goto fail; - } - if (info->domain_attr->auth_key != NULL) { - dup->domain_attr->auth_key = - mem_dup(info->domain_attr->auth_key, - info->domain_attr->auth_key_size); - if (dup->domain_attr->auth_key == NULL) - goto fail; - } - } - if (info->fabric_attr) { - dup->fabric_attr = mem_dup(info->fabric_attr, - sizeof(*info->fabric_attr)); - if (dup->fabric_attr == NULL) - goto fail; - dup->fabric_attr->name = NULL; - dup->fabric_attr->prov_name = NULL; - if (info->fabric_attr->name != NULL) { - dup->fabric_attr->name = strdup(info->fabric_attr->name); - if (dup->fabric_attr->name == NULL) - goto fail; - } - if (info->fabric_attr->prov_name != NULL) { - dup->fabric_attr->prov_name = strdup(info->fabric_attr->prov_name); - if (dup->fabric_attr->prov_name == NULL) - goto fail; - } - } - - if (info->nic) { - ret = fi_control(&info->nic->fid, FI_DUP, &dup->nic); - if (ret && ret != -FI_ENOSYS) - goto fail; - } - - return dup; - -fail: - fi_freeinfo(dup); - return NULL; -} -DEFAULT_SYMVER(fi_dupinfo_, fi_dupinfo, FABRIC_1.3); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int 
DEFAULT_SYMVER_PRE(fi_fabric)(struct fi_fabric_attr *attr, - struct fid_fabric **fabric, void *context) -{ - struct ofi_prov *prov; - const char *top_name; - int ret; - - if (!attr || !attr->prov_name || !attr->name) - return -FI_EINVAL; - - fi_ini(); - - top_name = strrchr(attr->prov_name, OFI_NAME_DELIM); - if (top_name) - top_name++; - else - top_name = attr->prov_name; - - if (!top_name) - return -FI_EINVAL; - - prov = ofi_getprov(top_name, strlen(top_name)); - if (!prov || !prov->provider || !prov->provider->fabric) - return -FI_ENODEV; - - ret = prov->provider->fabric(attr, fabric, context); - if (!ret) { - if (FI_VERSION_GE(prov->provider->fi_version, FI_VERSION(1, 5))) - (*fabric)->api_version = attr->api_version; - FI_INFO(&core_prov, FI_LOG_CORE, "Opened fabric: %s\n", - attr->name); - - ofi_hook_install(*fabric, fabric, prov->provider); - } - - return ret; -} -DEFAULT_SYMVER(fi_fabric_, fi_fabric, FABRIC_1.1); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -uint32_t DEFAULT_SYMVER_PRE(fi_version)(void) -{ - return FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION); -} -DEFAULT_SYMVER(fi_version_, fi_version, FABRIC_1.0); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int DEFAULT_SYMVER_PRE(fi_open)(uint32_t version, const char *name, - void *attr, size_t attr_len, uint64_t flags, - struct fid **fid, void *context) -{ - if (!strcasecmp("mr_cache", name)) - return ofi_open_mr_cache(version, attr, attr_len, - flags, fid, context); - - return -FI_ENOSYS; -} -DEFAULT_SYMVER(fi_open_, fi_open, FABRIC_1.5); - -static const char *const errstr[] = { - [FI_EOTHER - FI_ERRNO_OFFSET] = "Unspecified error", - [FI_ETOOSMALL - FI_ERRNO_OFFSET] = "Provided buffer is too small", - [FI_EOPBADSTATE - FI_ERRNO_OFFSET] = "Operation not permitted in current state", - [FI_EAVAIL - FI_ERRNO_OFFSET] = "Error available", - [FI_EBADFLAGS - FI_ERRNO_OFFSET] = "Flags not supported", - [FI_ENOEQ - FI_ERRNO_OFFSET] = "Missing or unavailable event queue", - [FI_EDOMAIN - FI_ERRNO_OFFSET] = "Invalid resource domain", - [FI_ENOCQ - FI_ERRNO_OFFSET] = "Missing or unavailable completion queue", - [FI_ECRC - FI_ERRNO_OFFSET] = "CRC error", - [FI_ETRUNC - FI_ERRNO_OFFSET] = "Truncation error", - [FI_ENOKEY - FI_ERRNO_OFFSET] = "Required key not available", - [FI_ENOAV - FI_ERRNO_OFFSET] = "Missing or unavailable address vector", - [FI_EOVERRUN - FI_ERRNO_OFFSET] = "Queue has been overrun", - [FI_ENORX - FI_ERRNO_OFFSET] = "Receiver not ready, no receive buffers available", -}; - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -const char *DEFAULT_SYMVER_PRE(fi_strerror)(int errnum) -{ - if (errnum < 0) - errnum = -errnum; - - if (errnum < FI_ERRNO_OFFSET) - return strerror(errnum); - else if (errnum < FI_ERRNO_MAX) - return errstr[errnum - FI_ERRNO_OFFSET]; - else - return errstr[FI_EOTHER - FI_ERRNO_OFFSET]; -} -DEFAULT_SYMVER(fi_strerror_, fi_strerror, FABRIC_1.0); diff --git a/shared/fi_tostr.c b/shared/fi_tostr.c deleted file mode 100644 index 0fd0c3a..0000000 --- a/shared/fi_tostr.c +++ /dev/null @@ -1,894 +0,0 @@ -/* - * Copyright (c) 2014-2017 Intel Corp., Inc. All rights reserved. - * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ofi.h" -#include -#include -#include -#include -#include - - -/* Print fi_info and related structs, enums, OR_able flags, addresses. - * - * Each printable type should be well formatted YAML. - * - * A struct is a dictionary containing one key named after the struct tag - * which contains a dictionary of member-value mappings. The struct member - * keys are the field names (not the types). - * - * Enum values are currently just bare strings. - * OR-able flags are a list of the values, ie: [ VAL1, VAL2 ] - * - * YAML does not contain tabs. - * Indentation delineates lists and dictionaries (or they can be inline). - * - * Printing functions are generally named after this pattern: - * - * struct fi_info : ofi_tostr_info(..., struct fi_info, ...) - * fi_info->caps : ofi_tostr_caps(..., typeof(caps), ...) 
- */ - - -static void -ofi_tostr_fid(const char *label, char *buf, size_t len, const struct fid *fid) -{ - if (!fid || !FI_CHECK_OP(fid->ops, struct fi_ops, tostr)) - ofi_strncatf(buf, len, "%s%p\n", label, fid); - else - fid->ops->tostr(fid, buf, len - strnlen(buf, len)); -} - -static void ofi_tostr_opflags(char *buf, size_t len, uint64_t flags) -{ - IFFLAGSTRN(flags, FI_MULTICAST, len); - - IFFLAGSTRN(flags, FI_MULTI_RECV, len); - IFFLAGSTRN(flags, FI_REMOTE_CQ_DATA, len); - IFFLAGSTRN(flags, FI_MORE, len); - IFFLAGSTRN(flags, FI_PEEK, len); - IFFLAGSTRN(flags, FI_TRIGGER, len); - IFFLAGSTRN(flags, FI_FENCE, len); - - IFFLAGSTRN(flags, FI_COMPLETION, len); - IFFLAGSTRN(flags, FI_INJECT, len); - IFFLAGSTRN(flags, FI_INJECT_COMPLETE, len); - IFFLAGSTRN(flags, FI_TRANSMIT_COMPLETE, len); - IFFLAGSTRN(flags, FI_DELIVERY_COMPLETE, len); - IFFLAGSTRN(flags, FI_MATCH_COMPLETE, len); - IFFLAGSTRN(flags, FI_AFFINITY, len); - - IFFLAGSTRN(flags, FI_CLAIM, len); - IFFLAGSTRN(flags, FI_DISCARD, len); - - ofi_remove_comma(buf); -} - -static void ofi_tostr_addr_format(char *buf, size_t len, uint32_t addr_format) -{ - switch (addr_format) { - CASEENUMSTRN(FI_FORMAT_UNSPEC, len); - CASEENUMSTRN(FI_SOCKADDR, len); - CASEENUMSTRN(FI_SOCKADDR_IN, len); - CASEENUMSTRN(FI_SOCKADDR_IN6, len); - CASEENUMSTRN(FI_SOCKADDR_IB, len); - CASEENUMSTRN(FI_ADDR_PSMX, len); - CASEENUMSTRN(FI_ADDR_PSMX2, len); - CASEENUMSTRN(FI_ADDR_GNI, len); - CASEENUMSTRN(FI_ADDR_BGQ, len); - CASEENUMSTRN(FI_ADDR_MLX, len); - CASEENUMSTRN(FI_ADDR_STR, len); - CASEENUMSTRN(FI_ADDR_IB_UD, len); - CASEENUMSTRN(FI_ADDR_EFA, len); - CASEENUMSTRN(FI_ADDR_PSMX3, len); - CASEENUMSTRN(FI_ADDR_OPX, len); - default: - if (addr_format & FI_PROV_SPECIFIC) - ofi_strncatf(buf, len, "Provider specific"); - else - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void ofi_tostr_progress(char *buf, size_t len, enum fi_progress progress) -{ - switch (progress) { - CASEENUMSTRN(FI_PROGRESS_UNSPEC, len); - CASEENUMSTRN(FI_PROGRESS_AUTO, len); - CASEENUMSTRN(FI_PROGRESS_MANUAL, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void -ofi_tostr_threading(char *buf, size_t len, enum fi_threading threading) -{ - switch (threading) { - CASEENUMSTRN(FI_THREAD_UNSPEC, len); - CASEENUMSTRN(FI_THREAD_SAFE, len); - CASEENUMSTRN(FI_THREAD_FID, len); - CASEENUMSTRN(FI_THREAD_DOMAIN, len); - CASEENUMSTRN(FI_THREAD_COMPLETION, len); - CASEENUMSTRN(FI_THREAD_ENDPOINT, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void ofi_tostr_msgorder(char *buf, size_t len, uint64_t flags) -{ - IFFLAGSTRN(flags, FI_ORDER_RAR, len); - IFFLAGSTRN(flags, FI_ORDER_RAW, len); - IFFLAGSTRN(flags, FI_ORDER_RAS, len); - IFFLAGSTRN(flags, FI_ORDER_WAR, len); - IFFLAGSTRN(flags, FI_ORDER_WAW, len); - IFFLAGSTRN(flags, FI_ORDER_WAS, len); - IFFLAGSTRN(flags, FI_ORDER_SAR, len); - IFFLAGSTRN(flags, FI_ORDER_SAW, len); - IFFLAGSTRN(flags, FI_ORDER_SAS, len); - IFFLAGSTRN(flags, FI_ORDER_RMA_RAR, len); - IFFLAGSTRN(flags, FI_ORDER_RMA_RAW, len); - IFFLAGSTRN(flags, FI_ORDER_RMA_WAR, len); - IFFLAGSTRN(flags, FI_ORDER_RMA_WAW, len); - IFFLAGSTRN(flags, FI_ORDER_ATOMIC_RAR, len); - IFFLAGSTRN(flags, FI_ORDER_ATOMIC_RAW, len); - IFFLAGSTRN(flags, FI_ORDER_ATOMIC_WAR, len); - IFFLAGSTRN(flags, FI_ORDER_ATOMIC_WAW, len); - - ofi_remove_comma(buf); -} - -static void ofi_tostr_comporder(char *buf, size_t len, uint64_t flags) -{ - if ((flags & FI_ORDER_STRICT) == FI_ORDER_NONE) { - ofi_strncatf(buf, len, "FI_ORDER_NONE, "); - } else 
if ((flags & FI_ORDER_STRICT) == FI_ORDER_STRICT) { - ofi_strncatf(buf, len, "FI_ORDER_STRICT, "); - } - - IFFLAGSTRN(flags, FI_ORDER_DATA, len); - - ofi_remove_comma(buf); -} - -static void ofi_tostr_caps(char *buf, size_t len, uint64_t caps) -{ - IFFLAGSTRN(caps, FI_MSG, len); - IFFLAGSTRN(caps, FI_RMA, len); - IFFLAGSTRN(caps, FI_TAGGED, len); - IFFLAGSTRN(caps, FI_ATOMIC, len); - IFFLAGSTRN(caps, FI_MULTICAST, len); - IFFLAGSTRN(caps, FI_COLLECTIVE, len); - - IFFLAGSTRN(caps, FI_READ, len); - IFFLAGSTRN(caps, FI_WRITE, len); - IFFLAGSTRN(caps, FI_RECV, len); - IFFLAGSTRN(caps, FI_SEND, len); - IFFLAGSTRN(caps, FI_REMOTE_READ, len); - IFFLAGSTRN(caps, FI_REMOTE_WRITE, len); - - IFFLAGSTRN(caps, FI_MULTI_RECV, len); - IFFLAGSTRN(caps, FI_REMOTE_CQ_DATA, len); - IFFLAGSTRN(caps, FI_TRIGGER, len); - IFFLAGSTRN(caps, FI_FENCE, len); - - IFFLAGSTRN(caps, FI_VARIABLE_MSG, len); - IFFLAGSTRN(caps, FI_RMA_PMEM, len); - IFFLAGSTRN(caps, FI_SOURCE_ERR, len); - IFFLAGSTRN(caps, FI_LOCAL_COMM, len); - IFFLAGSTRN(caps, FI_REMOTE_COMM, len); - IFFLAGSTRN(caps, FI_SHARED_AV, len); - IFFLAGSTRN(caps, FI_RMA_EVENT, len); - IFFLAGSTRN(caps, FI_SOURCE, len); - IFFLAGSTRN(caps, FI_NAMED_RX_CTX, len); - IFFLAGSTRN(caps, FI_DIRECTED_RECV, len); - IFFLAGSTRN(caps, FI_HMEM, len); - - ofi_remove_comma(buf); -} - -static void ofi_tostr_ep_type(char *buf, size_t len, enum fi_ep_type ep_type) -{ - switch (ep_type) { - CASEENUMSTRN(FI_EP_UNSPEC, len); - CASEENUMSTRN(FI_EP_MSG, len); - CASEENUMSTRN(FI_EP_DGRAM, len); - CASEENUMSTRN(FI_EP_RDM, len); - CASEENUMSTRN(FI_EP_SOCK_STREAM, len); - CASEENUMSTRN(FI_EP_SOCK_DGRAM, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void ofi_tostr_protocol(char *buf, size_t len, uint32_t protocol) -{ - switch (protocol) { - CASEENUMSTRN(FI_PROTO_UNSPEC, len); - CASEENUMSTRN(FI_PROTO_RDMA_CM_IB_RC, len); - CASEENUMSTRN(FI_PROTO_IWARP, len); - CASEENUMSTRN(FI_PROTO_IB_UD, len); - CASEENUMSTRN(FI_PROTO_PSMX, len); - CASEENUMSTRN(FI_PROTO_PSMX2, len); - CASEENUMSTRN(FI_PROTO_UDP, len); - CASEENUMSTRN(FI_PROTO_SOCK_TCP, len); - CASEENUMSTRN(FI_PROTO_IB_RDM, len); - CASEENUMSTRN(FI_PROTO_IWARP_RDM, len); - CASEENUMSTRN(FI_PROTO_GNI, len); - CASEENUMSTRN(FI_PROTO_RXM, len); - CASEENUMSTRN(FI_PROTO_RXD, len); - CASEENUMSTRN(FI_PROTO_MLX, len); - CASEENUMSTRN(FI_PROTO_NETWORKDIRECT, len); - CASEENUMSTRN(FI_PROTO_SHM, len); - CASEENUMSTRN(FI_PROTO_RSTREAM, len); - CASEENUMSTRN(FI_PROTO_RDMA_CM_IB_XRC, len); - CASEENUMSTRN(FI_PROTO_EFA, len); - CASEENUMSTRN(FI_PROTO_PSMX3, len); - CASEENUMSTRN(FI_PROTO_RXM_TCP, len); - CASEENUMSTRN(FI_PROTO_OPX, len); - default: - if (protocol & FI_PROV_SPECIFIC) - ofi_strncatf(buf, len, "Provider specific"); - else - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void ofi_tostr_mode(char *buf, size_t len, uint64_t mode) -{ - IFFLAGSTRN(mode, FI_CONTEXT, len); - IFFLAGSTRN(mode, FI_MSG_PREFIX, len); - IFFLAGSTRN(mode, FI_ASYNC_IOV, len); - IFFLAGSTRN(mode, FI_RX_CQ_DATA, len); - IFFLAGSTRN(mode, FI_LOCAL_MR, len); - IFFLAGSTRN(mode, FI_NOTIFY_FLAGS_ONLY, len); - IFFLAGSTRN(mode, FI_RESTRICTED_COMP, len); - IFFLAGSTRN(mode, FI_CONTEXT2, len); - IFFLAGSTRN(mode, FI_BUFFERED_RECV, len); - - ofi_remove_comma(buf); -} - -static void -ofi_tostr_addr(char *buf, size_t len, uint32_t addr_format, void *addr) -{ - char *p; - size_t addrlen; - - p = buf + strlen(buf); - addrlen = len - strlen(buf); - - if (addr == NULL) { - ofi_strncatf(p, addrlen, "(null)"); - return; - } - - ofi_straddr(p, &addrlen, addr_format, addr); -} 
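[Editor's note, illustrative only and not part of the patch: the fi_tostr.c code removed above is the YAML-style pretty-printer behind the public fi_tostr()/fi_tostr_r() entry points. A minimal sketch of how an application typically exercises that path is shown here; the FI_VERSION(1, 9) value and the absence of hints are assumptions for the example, not taken from this patch.]

/* Illustrative sketch (not part of this patch): discover providers with
 * fi_getinfo() and print each fi_info with fi_tostr(), which formats the
 * structure using the helpers deleted above. */
#include <stdio.h>
#include <rdma/fabric.h>

int main(void)
{
	struct fi_info *info, *cur;
	/* Requested API version is an assumption chosen for the example. */
	int ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, NULL, &info);

	if (ret) {
		fprintf(stderr, "fi_getinfo: %s\n", fi_strerror(-ret));
		return 1;
	}
	for (cur = info; cur; cur = cur->next)
		printf("%s\n", fi_tostr(cur, FI_TYPE_INFO));
	fi_freeinfo(info);
	return 0;
}

[End of editor's note.]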
- -static void -ofi_tostr_tx_attr(char *buf, size_t len, const struct fi_tx_attr *attr, - const char *prefix) -{ - if (!attr) { - ofi_strncatf(buf, len, "%sfi_tx_attr: (null)\n", prefix); - return; - } - - ofi_strncatf(buf, len, "%sfi_tx_attr:\n", prefix); - ofi_strncatf(buf, len, "%s%scaps: [ ", prefix, TAB); - ofi_tostr_caps(buf, len, attr->caps); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%smode: [ ", prefix, TAB); - ofi_tostr_mode(buf, len, attr->mode); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%sop_flags: [ ", prefix, TAB); - ofi_tostr_opflags(buf, len, attr->op_flags); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%smsg_order: [ ", prefix, TAB); - ofi_tostr_msgorder(buf, len, attr->msg_order); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%scomp_order: [ ", prefix, TAB); - ofi_tostr_comporder(buf, len, attr->comp_order); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%sinject_size: %zu\n", prefix, TAB, - attr->inject_size); - ofi_strncatf(buf, len, "%s%ssize: %zu\n", prefix, TAB, attr->size); - ofi_strncatf(buf, len, "%s%siov_limit: %zu\n", prefix, TAB, - attr->iov_limit); - ofi_strncatf(buf, len, "%s%srma_iov_limit: %zu\n", prefix, TAB, - attr->rma_iov_limit); - ofi_strncatf(buf, len, "%s%stclass: 0x%x\n", prefix, TAB, attr->tclass); -} - -static void -ofi_tostr_rx_attr(char *buf, size_t len, const struct fi_rx_attr *attr, - const char *prefix) -{ - if (!attr) { - ofi_strncatf(buf, len, "%sfi_rx_attr: (null)\n", prefix); - return; - } - - ofi_strncatf(buf, len, "%sfi_rx_attr:\n", prefix); - ofi_strncatf(buf, len, "%s%scaps: [ ", prefix, TAB); - ofi_tostr_caps(buf, len, attr->caps); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%smode: [ ", prefix, TAB); - ofi_tostr_mode(buf, len, attr->mode); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%sop_flags: [ ", prefix, TAB); - ofi_tostr_opflags(buf, len, attr->op_flags); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%smsg_order: [ ", prefix, TAB); - ofi_tostr_msgorder(buf, len, attr->msg_order); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%scomp_order: [ ", prefix, TAB); - ofi_tostr_comporder(buf, len, attr->comp_order); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%stotal_buffered_recv: %zu\n", prefix, TAB, - attr->total_buffered_recv); - ofi_strncatf(buf, len, "%s%ssize: %zu\n", prefix, TAB, attr->size); - ofi_strncatf(buf, len, "%s%siov_limit: %zu\n", prefix, TAB, - attr->iov_limit); -} - -static void -ofi_tostr_ep_attr(char *buf, size_t len, const struct fi_ep_attr *attr, - const char *prefix) -{ - if (!attr) { - ofi_strncatf(buf, len, "%sfi_ep_attr: (null)\n", prefix); - return; - } - - ofi_strncatf(buf, len, "%sfi_ep_attr:\n", prefix); - ofi_strncatf(buf, len, "%s%stype: ", prefix, TAB); - ofi_tostr_ep_type(buf, len, attr->type); - ofi_strncatf(buf, len, "\n"); - ofi_strncatf(buf, len, "%s%sprotocol: ", prefix, TAB); - ofi_tostr_protocol(buf, len, attr->protocol); - ofi_strncatf(buf, len, "\n"); - ofi_strncatf(buf, len, "%s%sprotocol_version: %d\n", prefix, TAB, - attr->protocol_version); - ofi_strncatf(buf, len, "%s%smax_msg_size: %zu\n", prefix, TAB, - attr->max_msg_size); - ofi_strncatf(buf, len, "%s%smsg_prefix_size: %zu\n", prefix, TAB, - attr->msg_prefix_size); - ofi_strncatf(buf, len, "%s%smax_order_raw_size: %zu\n", prefix, TAB, - attr->max_order_raw_size); - ofi_strncatf(buf, len, "%s%smax_order_war_size: %zu\n", prefix, TAB, - 
attr->max_order_war_size); - ofi_strncatf(buf, len, "%s%smax_order_waw_size: %zu\n", prefix, TAB, - attr->max_order_waw_size); - ofi_strncatf(buf, len, "%s%smem_tag_format: 0x%016llx\n", prefix, TAB, - attr->mem_tag_format); - - ofi_strncatf(buf, len, "%s%stx_ctx_cnt: ", prefix, TAB); - if (attr->tx_ctx_cnt == FI_SHARED_CONTEXT) - ofi_strncatf(buf, len, "FI_SHARED_CONTEXT\n"); - else - ofi_strncatf(buf, len, "%zu\n", attr->tx_ctx_cnt); - ofi_strncatf(buf, len, "%s%srx_ctx_cnt: ", prefix, TAB); - if (attr->rx_ctx_cnt == FI_SHARED_CONTEXT) - ofi_strncatf(buf, len, "FI_SHARED_CONTEXT\n"); - else - ofi_strncatf(buf, len, "%zu\n", attr->rx_ctx_cnt); - - ofi_strncatf(buf, len, "%s%sauth_key_size: %zu\n", prefix, TAB, - attr->auth_key_size); -} - -static void -ofi_tostr_resource_mgmt(char *buf, size_t len, enum fi_resource_mgmt rm) -{ - switch (rm) { - CASEENUMSTRN(FI_RM_UNSPEC, len); - CASEENUMSTRN(FI_RM_DISABLED, len); - CASEENUMSTRN(FI_RM_ENABLED, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void ofi_tostr_av_type(char *buf, size_t len, enum fi_av_type type) -{ - switch (type) { - CASEENUMSTRN(FI_AV_UNSPEC, len); - CASEENUMSTRN(FI_AV_MAP, len); - CASEENUMSTRN(FI_AV_TABLE, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void ofi_tostr_mr_mode(char *buf, size_t len, int mr_mode) -{ - IFFLAGSTRN(mr_mode, FI_MR_BASIC, len); - IFFLAGSTRN(mr_mode, FI_MR_SCALABLE, len); - IFFLAGSTRN(mr_mode, FI_MR_LOCAL, len); - IFFLAGSTRN(mr_mode, FI_MR_RAW, len); - IFFLAGSTRN(mr_mode, FI_MR_VIRT_ADDR, len); - IFFLAGSTRN(mr_mode, FI_MR_ALLOCATED, len); - IFFLAGSTRN(mr_mode, FI_MR_PROV_KEY, len); - IFFLAGSTRN(mr_mode, FI_MR_MMU_NOTIFY, len); - IFFLAGSTRN(mr_mode, FI_MR_RMA_EVENT, len); - IFFLAGSTRN(mr_mode, FI_MR_ENDPOINT, len); - IFFLAGSTRN(mr_mode, FI_MR_HMEM, len); - IFFLAGSTRN(mr_mode, FI_MR_COLLECTIVE, len); - - ofi_remove_comma(buf); -} - -static void ofi_tostr_op_type(char *buf, size_t len, int op_type) -{ - switch (op_type) { - CASEENUMSTRN(FI_OP_RECV, len); - CASEENUMSTRN(FI_OP_SEND, len); - CASEENUMSTRN(FI_OP_TRECV, len); - CASEENUMSTRN(FI_OP_TSEND, len); - CASEENUMSTRN(FI_OP_READ, len); - CASEENUMSTRN(FI_OP_WRITE, len); - CASEENUMSTRN(FI_OP_ATOMIC, len); - CASEENUMSTRN(FI_OP_FETCH_ATOMIC, len); - CASEENUMSTRN(FI_OP_COMPARE_ATOMIC, len); - CASEENUMSTRN(FI_OP_CNTR_SET, len); - CASEENUMSTRN(FI_OP_CNTR_ADD, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void -ofi_tostr_domain_attr(char *buf, size_t len, const struct fi_domain_attr *attr, - const char *prefix) -{ - if (!attr) { - ofi_strncatf(buf, len, "%sfi_domain_attr: (null)\n", prefix); - return; - } - - ofi_strncatf(buf, len, "%sfi_domain_attr:\n", prefix); - - ofi_strncatf(buf, len, "%s%sdomain: 0x%x\n", prefix, TAB, attr->domain); - - ofi_strncatf(buf, len, "%s%sname: %s\n", prefix, TAB, attr->name); - ofi_strncatf(buf, len, "%s%sthreading: ", prefix, TAB); - ofi_tostr_threading(buf, len, attr->threading); - ofi_strncatf(buf, len, "\n"); - - ofi_strncatf(buf, len, "%s%scontrol_progress: ", prefix,TAB); - ofi_tostr_progress(buf, len, attr->control_progress); - ofi_strncatf(buf, len, "\n"); - ofi_strncatf(buf, len, "%s%sdata_progress: ", prefix, TAB); - ofi_tostr_progress(buf, len, attr->data_progress); - ofi_strncatf(buf, len, "\n"); - ofi_strncatf(buf, len, "%s%sresource_mgmt: ", prefix, TAB); - ofi_tostr_resource_mgmt(buf, len, attr->resource_mgmt); - ofi_strncatf(buf, len, "\n"); - ofi_strncatf(buf, len, "%s%sav_type: ", prefix, TAB); - 
ofi_tostr_av_type(buf, len, attr->av_type); - ofi_strncatf(buf, len, "\n"); - ofi_strncatf(buf, len, "%s%smr_mode: [ ", prefix, TAB); - ofi_tostr_mr_mode(buf, len, attr->mr_mode); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%smr_key_size: %zu\n", prefix, TAB, - attr->mr_key_size); - ofi_strncatf(buf, len, "%s%scq_data_size: %zu\n", prefix, TAB, - attr->cq_data_size); - ofi_strncatf(buf, len, "%s%scq_cnt: %zu\n", prefix, TAB, - attr->cq_cnt); - ofi_strncatf(buf, len, "%s%sep_cnt: %zu\n", prefix, TAB, attr->ep_cnt); - ofi_strncatf(buf, len, "%s%stx_ctx_cnt: %zu\n", prefix, TAB, - attr->tx_ctx_cnt); - ofi_strncatf(buf, len, "%s%srx_ctx_cnt: %zu\n", prefix, TAB, - attr->rx_ctx_cnt); - ofi_strncatf(buf, len, "%s%smax_ep_tx_ctx: %zu\n", prefix, TAB, - attr->max_ep_tx_ctx); - ofi_strncatf(buf, len, "%s%smax_ep_rx_ctx: %zu\n", prefix, TAB, - attr->max_ep_rx_ctx); - ofi_strncatf(buf, len, "%s%smax_ep_stx_ctx: %zu\n", prefix, TAB, - attr->max_ep_stx_ctx); - ofi_strncatf(buf, len, "%s%smax_ep_srx_ctx: %zu\n", prefix, TAB, - attr->max_ep_srx_ctx); - ofi_strncatf(buf, len, "%s%scntr_cnt: %zu\n", prefix, TAB, - attr->cntr_cnt); - ofi_strncatf(buf, len, "%s%smr_iov_limit: %zu\n", prefix, TAB, - attr->mr_iov_limit); - - ofi_strncatf(buf, len, "%s%scaps: [ ", prefix, TAB); - ofi_tostr_caps(buf, len, attr->caps); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%smode: [ ", prefix, TAB); - ofi_tostr_mode(buf, len, attr->mode); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%s%sauth_key_size: %zu\n", prefix, TAB, - attr->auth_key_size); - ofi_strncatf(buf, len, "%s%smax_err_data: %zu\n", prefix, TAB, - attr->max_err_data); - ofi_strncatf(buf, len, "%s%smr_cnt: %zu\n", prefix, TAB, attr->mr_cnt); - ofi_strncatf(buf, len, "%s%stclass: 0x%x\n", prefix, TAB, attr->tclass); -} - -static void -ofi_tostr_fabric_attr(char *buf, size_t len, const struct fi_fabric_attr *attr, - const char *prefix) -{ - if (!attr) { - ofi_strncatf(buf, len, "%sfi_fabric_attr: (null)\n", prefix); - return; - } - - ofi_strncatf(buf, len, "%sfi_fabric_attr:\n", prefix); - ofi_strncatf(buf, len, "%s%sname: %s\n", prefix, TAB, attr->name); - ofi_strncatf(buf, len, "%s%sprov_name: %s\n", prefix, TAB, - attr->prov_name); - ofi_strncatf(buf, len, "%s%sprov_version: %d.%d\n", prefix, TAB, - FI_MAJOR(attr->prov_version), FI_MINOR(attr->prov_version)); - ofi_strncatf(buf, len, "%s%sapi_version: %d.%d\n", prefix, TAB, - FI_MAJOR(attr->api_version), FI_MINOR(attr->api_version)); -} - -static void ofi_tostr_info(char *buf, size_t len, const struct fi_info *info) -{ - ofi_strncatf(buf, len, "fi_info:\n"); - ofi_strncatf(buf, len, "%scaps: [ ", TAB); - ofi_tostr_caps(buf, len, info->caps); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%smode: [ ", TAB); - ofi_tostr_mode(buf, len, info->mode); - ofi_strncatf(buf, len, " ]\n"); - - ofi_strncatf(buf, len, "%saddr_format: ", TAB); - ofi_tostr_addr_format(buf, len, info->addr_format); - ofi_strncatf(buf, len, "\n"); - - ofi_strncatf(buf, len, "%ssrc_addrlen: %zu\n", TAB, info->src_addrlen); - ofi_strncatf(buf, len, "%sdest_addrlen: %zu\n", TAB, - info->dest_addrlen); - ofi_strncatf(buf, len, "%ssrc_addr: ", TAB); - ofi_tostr_addr(buf, len, info->addr_format, info->src_addr); - ofi_strncatf(buf, len, "\n"); - ofi_strncatf(buf, len, "%sdest_addr: ", TAB); - ofi_tostr_addr(buf, len, info->addr_format, info->dest_addr); - ofi_strncatf(buf, len, "\n"); - ofi_tostr_fid(TAB "handle: ", buf, len, info->handle); - - ofi_tostr_tx_attr(buf, len, 
info->tx_attr, TAB); - ofi_tostr_rx_attr(buf, len, info->rx_attr, TAB); - ofi_tostr_ep_attr(buf, len, info->ep_attr, TAB); - ofi_tostr_domain_attr(buf, len, info->domain_attr, TAB); - ofi_tostr_fabric_attr(buf, len, info->fabric_attr, TAB); - ofi_tostr_fid(TAB "nic: ", buf, len, &info->nic->fid); -} - -static void ofi_tostr_atomic_type(char *buf, size_t len, enum fi_datatype type) -{ - switch (type) { - CASEENUMSTRN(FI_INT8, len); - CASEENUMSTRN(FI_UINT8, len); - CASEENUMSTRN(FI_INT16, len); - CASEENUMSTRN(FI_UINT16, len); - CASEENUMSTRN(FI_INT32, len); - CASEENUMSTRN(FI_UINT32, len); - CASEENUMSTRN(FI_INT64, len); - CASEENUMSTRN(FI_UINT64, len); - CASEENUMSTRN(FI_INT128, len); - CASEENUMSTRN(FI_UINT128, len); - CASEENUMSTRN(FI_FLOAT, len); - CASEENUMSTRN(FI_DOUBLE, len); - CASEENUMSTRN(FI_FLOAT_COMPLEX, len); - CASEENUMSTRN(FI_DOUBLE_COMPLEX, len); - CASEENUMSTRN(FI_LONG_DOUBLE, len); - CASEENUMSTRN(FI_LONG_DOUBLE_COMPLEX, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void ofi_tostr_atomic_op(char *buf, size_t len, enum fi_op op) -{ - switch (op) { - CASEENUMSTRN(FI_MIN, len); - CASEENUMSTRN(FI_MAX, len); - CASEENUMSTRN(FI_SUM, len); - CASEENUMSTRN(FI_PROD, len); - CASEENUMSTRN(FI_LOR, len); - CASEENUMSTRN(FI_LAND, len); - CASEENUMSTRN(FI_BOR, len); - CASEENUMSTRN(FI_BAND, len); - CASEENUMSTRN(FI_LXOR, len); - CASEENUMSTRN(FI_BXOR, len); - CASEENUMSTRN(FI_ATOMIC_READ, len); - CASEENUMSTRN(FI_ATOMIC_WRITE, len); - CASEENUMSTRN(FI_CSWAP, len); - CASEENUMSTRN(FI_CSWAP_NE, len); - CASEENUMSTRN(FI_CSWAP_LE, len); - CASEENUMSTRN(FI_CSWAP_LT, len); - CASEENUMSTRN(FI_CSWAP_GE, len); - CASEENUMSTRN(FI_CSWAP_GT, len); - CASEENUMSTRN(FI_MSWAP, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void -ofi_tostr_collective_op(char *buf, size_t len, enum fi_collective_op op) -{ - switch (op) { - CASEENUMSTRN(FI_BARRIER, len); - CASEENUMSTRN(FI_BROADCAST, len); - CASEENUMSTRN(FI_ALLTOALL, len); - CASEENUMSTRN(FI_ALLREDUCE, len); - CASEENUMSTRN(FI_ALLGATHER, len); - CASEENUMSTRN(FI_REDUCE_SCATTER, len); - CASEENUMSTRN(FI_REDUCE, len); - CASEENUMSTRN(FI_SCATTER, len); - CASEENUMSTRN(FI_GATHER, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void ofi_tostr_version(char *buf, size_t len) -{ - ofi_strncatf(buf, len, VERSION); - ofi_strncatf(buf, len, BUILD_ID); -} - -static void ofi_tostr_eq_event(char *buf, size_t len, int type) -{ - switch (type) { - CASEENUMSTRN(FI_NOTIFY, len); - CASEENUMSTRN(FI_CONNREQ, len); - CASEENUMSTRN(FI_CONNECTED, len); - CASEENUMSTRN(FI_SHUTDOWN, len); - CASEENUMSTRN(FI_MR_COMPLETE, len); - CASEENUMSTRN(FI_AV_COMPLETE, len); - CASEENUMSTRN(FI_JOIN_COMPLETE, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void ofi_tostr_cq_event_flags(char *buf, size_t len, uint64_t flags) -{ - IFFLAGSTRN(flags, FI_SEND, len); - IFFLAGSTRN(flags, FI_RECV, len); - IFFLAGSTRN(flags, FI_RMA, len); - IFFLAGSTRN(flags, FI_ATOMIC, len); - IFFLAGSTRN(flags, FI_MSG, len); - IFFLAGSTRN(flags, FI_TAGGED, len); - IFFLAGSTRN(flags, FI_READ, len); - IFFLAGSTRN(flags, FI_WRITE, len); - IFFLAGSTRN(flags, FI_REMOTE_READ, len); - IFFLAGSTRN(flags, FI_REMOTE_WRITE, len); - IFFLAGSTRN(flags, FI_REMOTE_CQ_DATA, len); - IFFLAGSTRN(flags, FI_MULTI_RECV, len); - IFFLAGSTRN(flags, FI_MORE, len); - IFFLAGSTRN(flags, FI_CLAIM, len); - ofi_remove_comma(buf); -} - -static void -ofi_tostr_hmem_iface(char *buf, size_t len, enum fi_hmem_iface iface) -{ - switch (iface) { - CASEENUMSTRN(FI_HMEM_SYSTEM, 
len); - CASEENUMSTRN(FI_HMEM_CUDA, len); - CASEENUMSTRN(FI_HMEM_ROCR, len); - CASEENUMSTRN(FI_HMEM_ZE, len); - CASEENUMSTRN(FI_HMEM_NEURON, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -static void -ofi_tostr_cq_format(char *buf, size_t len, enum fi_cq_format cq_format) -{ - switch (cq_format) { - CASEENUMSTRN(FI_CQ_FORMAT_UNSPEC, len); - CASEENUMSTRN(FI_CQ_FORMAT_CONTEXT, len); - CASEENUMSTRN(FI_CQ_FORMAT_MSG, len); - CASEENUMSTRN(FI_CQ_FORMAT_DATA, len); - CASEENUMSTRN(FI_CQ_FORMAT_TAGGED, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -char *DEFAULT_SYMVER_PRE(fi_tostr_r)(char *buf, size_t len, - const void *data, enum fi_type datatype) -{ - const uint64_t *val64; - const uint32_t *val32; - const int *enumval; - - if (!data || !buf || !len) - return NULL; - - val64 = (const uint64_t *) data; - val32 = (const uint32_t *) data; - enumval = (const int *) data; - - buf[0] = '\0'; - - switch (datatype) { - case FI_TYPE_INFO: - ofi_tostr_info(buf, len, data); - break; - case FI_TYPE_EP_TYPE: - ofi_tostr_ep_type(buf, len, *enumval); - break; - case FI_TYPE_CAPS: - ofi_tostr_caps(buf, len, *val64); - break; - case FI_TYPE_OP_FLAGS: - ofi_tostr_opflags(buf, len, *val64); - break; - case FI_TYPE_ADDR_FORMAT: - ofi_tostr_addr_format(buf, len, *val32); - break; - case FI_TYPE_TX_ATTR: - ofi_tostr_tx_attr(buf, len, data, ""); - break; - case FI_TYPE_RX_ATTR: - ofi_tostr_rx_attr(buf, len, data, ""); - break; - case FI_TYPE_EP_ATTR: - ofi_tostr_ep_attr(buf, len, data, ""); - break; - case FI_TYPE_DOMAIN_ATTR: - ofi_tostr_domain_attr(buf, len, data, ""); - break; - case FI_TYPE_FABRIC_ATTR: - ofi_tostr_fabric_attr(buf, len, data, ""); - break; - case FI_TYPE_THREADING: - ofi_tostr_threading(buf, len, *enumval); - break; - case FI_TYPE_PROGRESS: - ofi_tostr_progress(buf, len, *enumval); - break; - case FI_TYPE_PROTOCOL: - ofi_tostr_protocol(buf, len, *val32); - break; - case FI_TYPE_MSG_ORDER: - ofi_tostr_msgorder(buf, len, *val64); - break; - case FI_TYPE_MODE: - ofi_tostr_mode(buf, len, *val64); - break; - case FI_TYPE_AV_TYPE: - ofi_tostr_av_type(buf, len, *enumval); - break; - case FI_TYPE_ATOMIC_TYPE: - ofi_tostr_atomic_type(buf, len, *enumval); - break; - case FI_TYPE_ATOMIC_OP: - ofi_tostr_atomic_op(buf, len, *enumval); - break; - case FI_TYPE_VERSION: - ofi_tostr_version(buf, len); - break; - case FI_TYPE_EQ_EVENT: - ofi_tostr_eq_event(buf, len, *enumval); - break; - case FI_TYPE_CQ_EVENT_FLAGS: - ofi_tostr_cq_event_flags(buf, len, *val64); - break; - case FI_TYPE_MR_MODE: - /* mr_mode was an enum converted to int flags */ - ofi_tostr_mr_mode(buf, len, *enumval); - break; - case FI_TYPE_OP_TYPE: - ofi_tostr_op_type(buf, len, *enumval); - break; - case FI_TYPE_FID: - ofi_tostr_fid("fid: ", buf, len, data); - break; - case FI_TYPE_COLLECTIVE_OP: - ofi_tostr_collective_op(buf, len, *enumval); - break; - case FI_TYPE_HMEM_IFACE: - ofi_tostr_hmem_iface(buf, len, *enumval); - break; - case FI_TYPE_CQ_FORMAT: - ofi_tostr_cq_format(buf, len, *enumval); - break; - default: - ofi_strncatf(buf, len, "Unknown type"); - break; - } - return buf; -} -DEFAULT_SYMVER(fi_tostr_r_, fi_tostr_r, FABRIC_1.4); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -char *DEFAULT_SYMVER_PRE(fi_tostr)(const void *data, enum fi_type datatype) -{ - static char *buf = NULL; - size_t len = 8192; - - if (!buf) { - buf = calloc(len, 1); - if (!buf) - return NULL; - } - - return fi_tostr_r(buf, len, data, 
datatype); -} -DEFAULT_SYMVER(fi_tostr_, fi_tostr, FABRIC_1.0); diff --git a/shared/hmem_synapseai.c b/shared/hmem_synapseai.c deleted file mode 100644 index 3940f7d..0000000 --- a/shared/hmem_synapseai.c +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#if HAVE_CONFIG_H -#include -#endif - -#include "ofi_hmem.h" -#include "ofi.h" - -int synapseai_init(void) -{ - return -FI_ENOSYS; -} - -int synapseai_cleanup(void) -{ - return -FI_ENOSYS; -} - -int synapseai_copy_to_hmem(uint64_t device, void *dest, const void *src, - size_t size) -{ - return -FI_ENOSYS; -} - -int synapseai_copy_from_hmem(uint64_t device, void *dest, const void *src, - size_t size) -{ - return -FI_ENOSYS; -} - -bool synapseai_is_addr_valid(const void *addr, uint64_t *device, - uint64_t *flags) -{ - return false; -} - -int synapseai_get_handle(void *dev_buf, void **handle) -{ - return -FI_ENOSYS; -} - -int synapseai_open_handle(void **handle, uint64_t device, void **ipc_ptr) -{ - return -FI_ENOSYS; -} - -int synapseai_close_handle(void *ipc_ptr) -{ - return -FI_ENOSYS; -} - -int synapseai_host_register(void *ptr, size_t size) -{ - return -FI_ENOSYS; -} - -int synapseai_host_unregister(void *ptr) -{ - return -FI_ENOSYS; -} - -int synapseai_get_base_addr(const void *ptr, void **base, size_t *size) -{ - return -FI_ENOSYS; -} - -bool synapseai_is_ipc_enabled(void) -{ - return false; -} diff --git a/shared/log.c b/shared/log.c deleted file mode 100644 index 775e7cf..0000000 --- a/shared/log.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2015-2016, Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015, Intel Corp., Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include -#include -#include - -#include - -#include "ofi.h" - - -static const char * const log_subsys[] = { - [FI_LOG_CORE] = "core", - [FI_LOG_FABRIC] = "fabric", - [FI_LOG_DOMAIN] = "domain", - [FI_LOG_EP_CTRL] = "ep_ctrl", - [FI_LOG_EP_DATA] = "ep_data", - [FI_LOG_AV] = "av", - [FI_LOG_CQ] = "cq", - [FI_LOG_EQ] = "eq", - [FI_LOG_MR] = "mr", - [FI_LOG_CNTR] = "cntr", - [FI_LOG_SUBSYS_MAX] = NULL -}; - -static const char * const log_levels[] = { - [FI_LOG_WARN] = "warn", - [FI_LOG_TRACE] = "trace", - [FI_LOG_INFO] = "info", - [FI_LOG_DEBUG] = "debug", - [FI_LOG_MAX] = NULL -}; - -enum { - FI_LOG_SUBSYS_OFFSET = FI_LOG_MAX, - FI_LOG_PROV_OFFSET = FI_LOG_SUBSYS_OFFSET + FI_LOG_SUBSYS_MAX, - FI_LOG_LEVEL_MASK = ((1 << FI_LOG_MAX) - 1), - FI_LOG_SUBSYS_MASK = (((1 << FI_LOG_SUBSYS_MAX) - 1) << - FI_LOG_SUBSYS_OFFSET), -// FI_LOG_PROV_MASK = (((1 << (64 - FI_LOG_PROV_OFFSET)) - 1) << -// FI_LOG_PROV_OFFSET) -}; - -#define FI_LOG_TAG(prov, level, subsys) \ - (((uint64_t) prov << FI_LOG_PROV_OFFSET) | \ - ((uint64_t) (1 << (subsys + FI_LOG_SUBSYS_OFFSET))) | \ - ((uint64_t) (1 << level))) - -static int log_interval = 2000; -uint64_t log_mask; -struct fi_filter prov_log_filter; - -static pid_t pid; - -static int fi_convert_log_str(const char *value) -{ - int i; - - if (!value) - return -1; - - for (i = 0; log_levels[i]; i++) { - if (!strcasecmp(value, log_levels[i])) - return i; - } - return 0; -} - -void fi_log_init(void) -{ - struct fi_filter subsys_filter; - int level, i; - char *levelstr = NULL, *provstr = NULL, *subsysstr = NULL; - - fi_param_define(NULL, "log_interval", FI_PARAM_INT, - "Delay in ms between rate limited log messages " - "(default 2000)"); - fi_param_get_int(NULL, "log_interval", &log_interval); - - fi_param_define(NULL, "log_level", FI_PARAM_STRING, - "Specify logging level: warn, trace, info, debug (default: warn)"); - fi_param_get_str(NULL, "log_level", &levelstr); - level = fi_convert_log_str(levelstr); - if (level >= 0) - log_mask = ((1 << (level + 1)) - 1); - - fi_param_define(NULL, "log_prov", FI_PARAM_STRING, - "Specify specific provider to log (default: all)"); - fi_param_get_str(NULL, "log_prov", &provstr); - ofi_create_filter(&prov_log_filter, provstr); - - fi_param_define(NULL, "log_subsys", FI_PARAM_STRING, - "Specify specific subsystem to 
log (default: all)"); - fi_param_get_str(NULL, "log_subsys", &subsysstr); - ofi_create_filter(&subsys_filter, subsysstr); - for (i = 0; i < FI_LOG_SUBSYS_MAX; i++) { - if (!ofi_apply_filter(&subsys_filter, log_subsys[i])) - log_mask |= (1ULL << (i + FI_LOG_SUBSYS_OFFSET)); - } - ofi_free_filter(&subsys_filter); - pid = getpid(); -} - -void fi_log_fini(void) -{ - ofi_free_filter(&prov_log_filter); -} - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int DEFAULT_SYMVER_PRE(fi_log_enabled)(const struct fi_provider *prov, - enum fi_log_level level, - enum fi_log_subsys subsys) -{ - struct fi_prov_context *ctx; - - ctx = (struct fi_prov_context *) &prov->context; - return ((FI_LOG_TAG(ctx->disable_logging, level, subsys) & log_mask) == - FI_LOG_TAG(ctx->disable_logging, level, subsys)); -} -DEFAULT_SYMVER(fi_log_enabled_, fi_log_enabled, FABRIC_1.0); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int DEFAULT_SYMVER_PRE(fi_log_ready)(const struct fi_provider *prov, - enum fi_log_level level, enum fi_log_subsys subsys, - uint64_t *showtime) -{ - uint64_t cur; - - if (fi_log_enabled(prov, level, subsys)) { - cur = ofi_gettime_ms(); - if (cur >= *showtime) { - *showtime = cur + (uint64_t) log_interval; - return true; - } - } - return false; -} -CURRENT_SYMVER(fi_log_ready_, fi_log_ready); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -void DEFAULT_SYMVER_PRE(fi_log)(const struct fi_provider *prov, enum fi_log_level level, - enum fi_log_subsys subsys, const char *func, int line, - const char *fmt, ...) -{ - char buf[1024]; - int size; - - va_list vargs; - - size = snprintf(buf, sizeof(buf), "%s:%d:%ld:%s:%s:%s:%s():%d<%s> ", - PACKAGE, pid, (unsigned long) time(NULL), log_prefix, - prov->name, log_subsys[subsys], func, line, - log_levels[level]); - - va_start(vargs, fmt); - vsnprintf(buf + size, sizeof(buf) - size, fmt, vargs); - va_end(vargs); - - fprintf(stderr, "%s", buf); -} -DEFAULT_SYMVER(fi_log_, fi_log, FABRIC_1.0); diff --git a/shared/perf.c b/shared/perf.c deleted file mode 100644 index 1cd2a65..0000000 --- a/shared/perf.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2018 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - - -#include -#include -#include -#include -#include - -#include -#include -#include - - -enum ofi_perf_domain perf_domain = OFI_PMU_CPU; -uint32_t perf_cntr = OFI_PMC_CPU_INSTR; -uint32_t perf_flags; - - -void ofi_perf_init(void) -{ - char *param_val = NULL; - - fi_param_define(NULL, "perf_cntr", FI_PARAM_STRING, - "Performance counter to analyze (default: cpu_instr). " - "Options: cpu_instr, cpu_cycles."); - fi_param_get_str(NULL, "perf_cntr", ¶m_val); - if (!param_val) - return; - - if (!strcasecmp(param_val, "cpu_cycles")) { - perf_domain = OFI_PMU_CPU; - perf_cntr = OFI_PMC_CPU_CYCLES; - } -} - -int ofi_perfset_create(const struct fi_provider *prov, - struct ofi_perfset *set, size_t size, - enum ofi_perf_domain domain, uint32_t cntr_id, - uint32_t flags) -{ - int ret; - - ret = ofi_pmu_open(&set->ctx, domain, cntr_id, flags); - if (ret) { - FI_WARN(prov, FI_LOG_CORE, "Unable to open PMU %d (%s)\n", - ret, fi_strerror(ret)); - return ret; - } - - set->data = calloc(size, sizeof(*set->data)); - if (!set->data) { - ofi_pmu_close(set->ctx); - return -FI_ENOMEM; - } - - set->prov = prov; - set->size = size; - return 0; -} - -void ofi_perfset_close(struct ofi_perfset *set) -{ - ofi_pmu_close(set->ctx); - free(set->data); -} - -static const char *ofi_perf_name(void) -{ - switch (perf_domain) { - case OFI_PMU_CPU: - switch (perf_cntr) { - case OFI_PMC_CPU_CYCLES: - return "CPU cycles"; - case OFI_PMC_CPU_INSTR: - return "CPU instr"; - } - break; - case OFI_PMU_CACHE: - switch (perf_cntr) { - case OFI_PMC_CACHE_L1_DATA: - return "L1 data cache"; - case OFI_PMC_CACHE_L1_INSTR: - return "L1 instr cache"; - case OFI_PMC_CACHE_TLB_DATA: - return "TLB data cache"; - case OFI_PMC_CACHE_TLB_INSTR: - return "TLB instr cache"; - } - break; - case OFI_PMU_OS: - switch (perf_cntr) { - case OFI_PMC_OS_PAGE_FAULT: - return "page faults"; - } - break; - case OFI_PMU_NIC: - break; - } - return "unknown"; -} - -void ofi_perfset_log(struct ofi_perfset *set, const char *names[]) -{ - size_t i; - - FI_TRACE(set->prov, FI_LOG_CORE, "\n"); - FI_TRACE(set->prov, FI_LOG_CORE, "\tPERF: %s\n", ofi_perf_name()); - FI_TRACE(set->prov, FI_LOG_CORE, "\t%-20s%-10s%s\n", "Name", "Avg", "Events"); - - for (i = 0; i < set->size; i++) { - if (!set->data[i].events) - continue; - - FI_TRACE(set->prov, FI_LOG_CORE, "\t%-20s%-10g%" PRIu64 "\n", - names && names[i] ? names[i] : "unknown", - (double) set->data[i].sum / set->data[i].events, - set->data[i].events); - } -} diff --git a/shared/var.c b/shared/var.c deleted file mode 100644 index 6f4a9db..0000000 --- a/shared/var.c +++ /dev/null @@ -1,337 +0,0 @@ -/* - * Copyright (c) 2015-2016, Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015, Intel Corp., Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include -#include -#include -#include - -#include - -#include "ofi.h" -#include "ofi_list.h" - - -extern void fi_ini(void); - -struct fi_param_entry { - const struct fi_provider *provider; - char *name; - enum fi_param_type type; - char *help_string; - char *env_var_name; - struct dlist_entry entry; -}; - -/* TODO: Add locking around param_list when adding dynamic removal */ -static DEFINE_LIST(param_list); - - -static struct fi_param_entry * -fi_find_param(const struct fi_provider *provider, const char *param_name) -{ - struct fi_param_entry *param; - struct dlist_entry *entry; - - for (entry = param_list.next; entry != ¶m_list; entry = entry->next) { - param = container_of(entry, struct fi_param_entry, entry); - if (param->provider == provider && - strcmp(param->name, param_name) == 0) { - return param; - } - } - - FI_DBG(provider, FI_LOG_CORE, - "Failed to find parameter %s: was not defined\n", param_name); - return NULL; -} - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int DEFAULT_SYMVER_PRE(fi_getparams)(struct fi_param **params, int *count) -{ - struct fi_param *vhead = NULL; - struct fi_param_entry *param; - struct dlist_entry *entry; - int cnt, i; - char *tmp; - - fi_ini(); - - for (entry = param_list.next, cnt = 0; entry != ¶m_list; - entry = entry->next) - cnt++; - - if (cnt == 0) - goto out; - - // last extra entry will be all NULL - vhead = calloc(cnt + 1, sizeof (*vhead)); - if (!vhead) - return -FI_ENOMEM; - - for (entry = param_list.next, i = 0; entry != ¶m_list; - entry = entry->next, i++) { - param = container_of(entry, struct fi_param_entry, entry); - vhead[i].name = strdup(param->env_var_name); - vhead[i].type = param->type; - vhead[i].help_string = strdup(param->help_string); - - tmp = getenv(param->env_var_name); - if (tmp) - vhead[i].value = strdup(tmp); - - if (!vhead[i].name || !vhead[i].help_string) { - fi_freeparams(vhead); - return -FI_ENOMEM; - } - } - -out: - *count = cnt; - *params = vhead; - return FI_SUCCESS; -} -DEFAULT_SYMVER(fi_getparams_, fi_getparams, FABRIC_1.0); - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -void DEFAULT_SYMVER_PRE(fi_freeparams)(struct fi_param *params) -{ - int i; - for (i = 0; params[i].name; ++i) { - free((void*) params[i].name); - free((void*) params[i].help_string); - free((void*) params[i].value); - } - free(params); -} -DEFAULT_SYMVER(fi_freeparams_, fi_freeparams, FABRIC_1.0); - -static void fi_free_param(struct fi_param_entry *param) -{ - free(param->name); - free(param->help_string); - free(param->env_var_name); - free(param); -} - -void fi_param_undefine(const struct fi_provider *provider) -{ - struct fi_param_entry *param; - struct dlist_entry *entry; - struct dlist_entry *next; - - for (entry = param_list.next; entry != ¶m_list; entry = next) { - next = entry->next; - 
param = container_of(entry, struct fi_param_entry, entry); - if (param->provider == provider) { - FI_DBG(provider, FI_LOG_CORE, "Removing param: %s\n", param->name); - dlist_remove(entry); - fi_free_param(param); - } - } -} - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int DEFAULT_SYMVER_PRE(fi_param_define)(const struct fi_provider *provider, - const char *param_name, enum fi_param_type type, - const char *help_string_fmt, ...) -{ - int i, ret; - struct fi_param_entry *v; - char *tmp_str; - va_list vargs; - - if (!provider) - provider = &core_prov; - - // Check for bozo cases - if (param_name == NULL || help_string_fmt == NULL || *help_string_fmt == '\0') { - FI_DBG(provider, FI_LOG_CORE, - "Failed to register %s variable: provider coding error\n", - param_name); - return -FI_EINVAL; - } - - v = calloc(1, sizeof(*v)); - if (!v) { - FI_DBG(provider, FI_LOG_CORE, - "Failed to register %s variable: ENOMEM\n", param_name); - return -FI_ENOMEM; - } - - v->provider = provider; - v->name = strdup(param_name); - v->type = type; - - va_start(vargs, help_string_fmt); - ret = vasprintf(&v->help_string, help_string_fmt, vargs); - va_end(vargs); - if (ret < 0) - v->help_string = NULL; - - if (provider != &core_prov) { - ret = asprintf(&tmp_str, "%s: %s", provider->name, v->help_string); - free(v->help_string); - if (ret < 0) - v->help_string = NULL; - v->help_string = tmp_str; - ret = asprintf(&v->env_var_name, "FI_%s_%s", provider->name, param_name); - if (ret < 0) - v->env_var_name = NULL; - } else { - ret = asprintf(&v->env_var_name, "FI_%s", param_name); - if (ret < 0) - v->env_var_name = NULL; - } - if (!v->name || !v->help_string || !v->env_var_name) { - fi_free_param(v); - FI_DBG(provider, FI_LOG_CORE, - "Failed to register %s variable: ENOMEM\n", param_name); - return -FI_ENOMEM; - } - - for (i = 0; v->env_var_name[i]; ++i) - v->env_var_name[i] = (char) toupper(v->env_var_name[i]); - - dlist_insert_tail(&v->entry, ¶m_list); - - FI_DBG(provider, FI_LOG_CORE, "registered var %s\n", param_name); - return FI_SUCCESS; -} -DEFAULT_SYMVER(fi_param_define_, fi_param_define, FABRIC_1.0); - -static int fi_parse_bool(const char *str_value) -{ - if (strcmp(str_value, "0") == 0 || - strcasecmp(str_value, "false") == 0 || - strcasecmp(str_value, "no") == 0 || - strcasecmp(str_value, "off") == 0) { - return 0; - } - - if (strcmp(str_value, "1") == 0 || - strcasecmp(str_value, "true") == 0 || - strcasecmp(str_value, "yes") == 0 || - strcasecmp(str_value, "on") == 0) { - return 1; - } - - return -1; -} - -__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -int DEFAULT_SYMVER_PRE(fi_param_get)(struct fi_provider *provider, - const char *param_name, void *value) -{ - struct fi_param_entry *param; - char *str_value; - int parsed_boolean; - int ret = FI_SUCCESS; - - if (!provider) - provider = &core_prov; - - if (!param_name || !value) { - FI_DBG(provider, FI_LOG_CORE, - "Failed to read %s variable: provider coding error\n", - param_name); - return -FI_EINVAL; - } - - param = fi_find_param(provider, param_name); - if (!param) - return -FI_ENOENT; - - str_value = getenv(param->env_var_name); - if (!str_value) { - FI_INFO(provider, FI_LOG_CORE, - "variable %s=\n", param_name); - ret = -FI_ENODATA; - goto out; - } - - switch (param->type) { - case FI_PARAM_STRING: - * ((char **) value) = str_value; - FI_INFO(provider, FI_LOG_CORE, - "read string var %s=%s\n", param_name, *(char **) value); - break; - case FI_PARAM_INT: - * ((int *) value) = strtol(str_value, NULL, 0); - FI_INFO(provider, 
FI_LOG_CORE, - "read int var %s=%d\n", param_name, *(int *) value); - break; - case FI_PARAM_BOOL: - parsed_boolean = fi_parse_bool(str_value); - if (parsed_boolean == -1) { - ret = -FI_EINVAL; - FI_WARN(provider, FI_LOG_CORE, - "failed to parse bool var %s=%s\n", param_name, str_value); - break; - } - - * ((int *) value) = parsed_boolean; - FI_INFO(provider, FI_LOG_CORE, - "read bool var %s=%d\n", param_name, *(int *) value); - break; - case FI_PARAM_SIZE_T: - * ((size_t *) value) = strtol(str_value, NULL, 0); - FI_INFO(provider, FI_LOG_CORE, - "read long var %s=%zu\n", param_name, *(size_t *) value); - break; - } - -out: - return ret; -} -DEFAULT_SYMVER(fi_param_get_, fi_param_get, FABRIC_1.0); - - -void fi_param_init(void) -{ - dlist_init(¶m_list); -} - -void fi_param_fini(void) -{ - struct fi_param_entry *param; - struct dlist_entry *entry; - - while (!dlist_empty(¶m_list)) { - entry = param_list.next; - param = container_of(entry, struct fi_param_entry, entry); - dlist_remove(entry); - fi_free_param(param); - } -}
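[Editor's note, illustrative only and not part of the patch: shared/var.c, removed above, implements the FI_* environment-variable machinery (fi_param_define(), fi_param_get(), fi_getparams()). The sketch below shows the consumer-side pattern used by the deleted log.c for FI_LOG_INTERVAL; the parameter name and default here are hypothetical, and the declarations are assumed to come from libfabric's internal "ofi.h", as in the files removed above.]

/* Illustrative sketch (not part of this patch): register an integer
 * runtime parameter and read it back, mirroring the deleted log.c code. */
#include "ofi.h"

static int example_interval = 2000;	/* default used when the env var is unset */

static void example_param_init(void)
{
	/* With a NULL provider the core prefix is used, so this registers
	 * the hypothetical variable FI_EXAMPLE_INTERVAL. */
	fi_param_define(NULL, "example_interval", FI_PARAM_INT,
			"Hypothetical delay in ms (default 2000)");

	/* Overwrites the default only when the variable is set and parses. */
	fi_param_get_int(NULL, "example_interval", &example_interval);
}

[End of editor's note.]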